python_apps/crwlers/lib/imarketlib.py
2023-11-03 14:49:12 +09:00

364 lines
14 KiB
Python

#!/usr/bin/env python
from typing import Dict, Any, Union
from datetime import datetime
import requests, bs4, re, logging
from urllib import parse
class imarketGet:
    """Fetch one imarket.co.kr listing page and expose the parsed results.

    Constructing an instance performs the HTTP request immediately and
    stores the parsed page together with a few derived statistics:

    * ``bs``          - parsed page (BeautifulSoup object)
    * ``prdList``     - product ``<li>`` nodes found on the page
    * ``resultCount`` - total hit count reported by the page, as a string
    * ``validRc``     - ``[valid_count, sold_out_count]``
    * ``AVGprice``    - ``[average_price, member_only_price_count]``
    * ``top3Cate``    - the three largest categories (see ``categoryRefine``)

    Args:
        keyword: Search term for the ``"search"`` / ``"SEM"`` modes; in
            ``"category"`` mode it is sent as the ``sc.dispNo`` parameter.
        mode: One of ``"search"``, ``"category"`` or ``"SEM"``.
        attrDict: Optional overrides (``BIZ_CD``, ``utm_source``,
            ``utm_medium``, ``utm_campaign``, ``utm_content``,
            ``utm_keyword``) used by ``GetTargetUrl``.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
        requests.HTTPError: If the site answers with an error status.
    """

    # Number of rows requested per page for each supported mode.
    _ROWS_BY_MODE = {"search": 100, "category": 1000, "SEM": 20}
    # Seconds to wait for the site; without a timeout requests can block forever.
    _REQUEST_TIMEOUT = 30
    # Marker texts that appear on the page itself (must stay in Korean).
    _SOLD_OUT = "일시품절"
    _MEMBER_ONLY_PRICE = "회원특별가"
    _ON_SALE = "판매중"
    # Per-product keys that are blank in the placeholder row of getfullResult.
    _BLANK_ROW_KEYS = (
        'rank', 'prdCode', 'prdName', 'promoMsg', 'price', 'coupon', 'moq',
        'imgTags', 'tagData', 'imgURL', 'outofStock', 'brandName', 'modelnum',
    )

    def __init__(self, keyword, mode, attrDict):
        # Fail early and clearly: an unknown mode used to surface much later
        # as an AttributeError on ``self.row``.
        if mode not in self._ROWS_BY_MODE:
            raise ValueError(
                "unsupported mode %r; expected one of %s"
                % (mode, sorted(self._ROWS_BY_MODE))
            )
        self.startTime = datetime.now()
        self.keyword = keyword
        self.url = "https://www.imarket.co.kr/display/malls.do"
        self.mode = mode
        self.row = self._ROWS_BY_MODE[mode]
        self.attr = attrDict
        # NOTE: the next two assignments deliberately replace the bound
        # methods with their results on the instance; ``self.query`` and
        # ``self.urlterm`` are plain values from here on.
        self.query = self.query()
        self.urlterm = self.urlterm()
        self.bs = self.getHTML()
        self.prdList = self.getPrdList()
        self.resultCount = self.getResultCount()
        self.validRc = self.getValidResultCount()
        self.AVGprice = self.getAVGprice()
        self.top3Cate = self.categoryRefine(3)

    def query(self):
        """Return the keyword encoded as EUC-KR bytes."""
        return self.keyword.encode('euc-kr')

    def urlterm(self):
        """Return the percent-encoded form of the EUC-KR encoded keyword."""
        return parse.quote(self.query)

    def urlParameters(self):
        """Build the query-string parameters for the current mode.

        Returns:
            A dict of request parameters. Key order is preserved because it
            determines the order of the parameters in the generated URL.
        """
        data: Dict[str, Any] = {
            'sc.page': '1',
            'sc.row': self.row,
            'sc.viewType': 'list',
        }
        if self.mode in ("search", "SEM"):
            data['_method'] = 'searchGoods'
            data['sc.queryText'] = self.query
        if self.mode == "category":
            data['_method'] = '1Depth'
            data['cateDepth'] = '4'
            data['sc.shopNo'] = '0000100000'
            data['sc.dispNo'] = self.query
        return data

    def headers(self):
        """Return the browser-like HTTP headers sent with the page request."""
        return {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.imarket.co.kr',
            'Pragma': 'no-cache',
            'Referer': 'https://www.imarket.co.kr/',
            'Save-Data': 'on',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        }

    def getHTML(self):
        """Download the listing page and return it parsed with BeautifulSoup.

        Raises:
            requests.HTTPError: On a 4xx/5xx response.
            requests.Timeout: If the site does not respond in time.
        """
        resp = requests.get(
            self.url,
            params=self.urlParameters(),
            headers=self.headers(),
            timeout=self._REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        # The response is decoded as EUC-KR regardless of what the server
        # declares.
        resp.encoding = 'EUC-KR'
        return bs4.BeautifulSoup(resp.text, 'html.parser')

    def GetTargetUrl(self):
        """Return the listing URL with tracking parameters appended.

        Values come from the ``attrDict`` given to the constructor, falling
        back to built-in defaults. The URL is assembled locally; no request
        is sent.
        """
        paramz = self.urlParameters()
        paras = self.attr or {}
        # Looked up lazily so top3Cate is only consulted when needed.
        if 'utm_content' in paras:
            utm_content = paras['utm_content']
        else:
            utm_content = self.top3Cate['top1CateName']
        paramz.update({
            'BIZ_CD': paras.get('BIZ_CD', '1010187'),
            'utm_source': paras.get('utm_source', 'naverPowerlink'),
            'utm_medium': paras.get('utm_medium', 'prdcpc'),
            'utm_campaign': paras.get('utm_campaign', 'test'),
            'utm_content': utm_content,
            # Sent as utm_term: reportedly tracked more reliably (newer convention).
            'utm_term': paras.get('utm_keyword', ''),
        })
        # Preparing the request encodes the parameters exactly as a real GET
        # would, without the cost and failure modes of a network round trip.
        return requests.Request('GET', self.url, params=paramz).prepare().url

    def getResultCount(self):
        """Return the total hit count shown on the page as a digit string.

        Returns ``"0"`` when the counter element is absent (the page omits
        it when there are no results). Thousands separators are removed.
        """
        if self.mode == "category":
            selector = 'div.prd_list_wrap div.sort_wrap span.total em'
        else:
            # "search" / "SEM": a search-result page.
            selector = 'div.tit_category_wrap h2.tit_result span em'
        found = self.bs.select(selector)
        rc = found[0].getText().strip() if found else "0"
        return rc.replace(",", "")

    def getPrdList(self):
        """Return every product ``<li>`` node on the page."""
        return self.bs.select('ul.prd_list_type li')

    def getValidResultCount(self):
        """Return ``[valid_count, sold_out_count]`` for the page.

        Sold-out items are subtracted from the total only when all results
        fit on the fetched page; otherwise the total is returned unchanged
        because items beyond the page cannot be inspected.
        """
        sold_out = sum(
            1 for item in self.prdList if self.stockChk(item) == self._SOLD_OUT
        )
        total = int(self.resultCount)
        valid = total - sold_out if total <= self.row else total
        return [valid, sold_out]

    def getAVGprice(self):
        """Return ``[average_price, member_only_price_count]``.

        Each item contributes ``price * minimum order quantity``. Sold-out
        items and items showing a member-only price contribute 0.

        NOTE(review): those zero entries are still counted in the divisor,
        which pulls the average down - confirm this is intended.
        """
        totals = []
        hidden_price_count = 0
        for item in self.prdList:
            price_text = self.getPrdPrice(item)
            if price_text == self._MEMBER_ONLY_PRICE:
                hidden_price_count += 1
                totals.append(0)
            elif self.stockChk(item) == self._SOLD_OUT:
                totals.append(0)
            else:
                totals.append(int(price_text) * int(self.getPrdMoq(item)))
        average = round(sum(totals) / len(totals)) if totals else 0
        return [average, hidden_price_count]

    def categoryRefine(self, cutoff):
        """Return the ``cutoff`` largest categories from the filter panel.

        Returns:
            A dict with keys ``top1CateName``, ``top1CateCount``, ...,
            ``top<cutoff>CateName``, ``top<cutoff>CateCount``. Slots with no
            category are filled with empty strings, so every key is always
            present.
        """
        cutoff = int(cutoff)
        counts = {}
        for link in self.bs.select('div.filter_wrap ul li dl.category dd ul li a'):
            # The anchor text is "<name>(<count>)"; the count is also in <em>.
            name = re.sub(r"\([0-9,]+\)", "", link.getText().strip()).strip()
            count_text = link.select('em')[0].getText().strip()
            counts[name] = int(re.sub(r"[(),]", "", count_text))
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:cutoff]

        result = {}
        for rank in range(1, cutoff + 1):
            # Pad from the number of *unique* names actually ranked, so
            # duplicate names on the page cannot leave keys missing.
            name, count = ranked[rank - 1] if rank <= len(ranked) else ("", "")
            result['top' + str(rank) + 'CateName'] = name
            result['top' + str(rank) + 'CateCount'] = count
        return result

    # ---- per-product field extractors ---------------------------------

    def getPrdCode(self, list_data):
        """Return the product code with its "상품코드 : " label removed."""
        code = list_data.select('div.info_box span.prd_code')[0].getText().strip()
        return re.sub(r"^상품코드 : ", "", code)

    def getPrdName(self, list_data):
        """Return the product title, or an error marker if it is missing."""
        titles = list_data.select('div.info_box a.tit')
        if not titles:
            return "error!!!!!!!!!!!!!!!!"
        return titles[0].getText().strip()

    def getpromoMsg(self, list_data):
        """Return the promotion message, or ``""`` if there is none."""
        promo = list_data.select('div.info_box p.prd_promo')
        return promo[0].getText().strip() if promo else ""

    def getPrdPrice(self, list_data):
        """Return the sale-price text without the "원" suffix and commas.

        The result is a string; it may be non-numeric (for example the
        member-only price marker).
        """
        price = list_data.select('div.price_box span.sale_price')[0].getText().strip()
        price = re.sub(r"원$", "", price)
        return price.replace(",", "")

    def getPrdCouponRate(self, list_data):
        """Return the discount figure text, or ``''`` if none is shown."""
        found = list_data.select('div.price_box span.discount em.num')
        return found[0].getText().strip() if found else ''

    def getPrdMoq(self, list_data):
        """Return the ``value`` of the quantity input (minimum order quantity)."""
        field = list_data.select('div.amount_box span.btn_wrap label input.pr-number')[0]
        return field.get('value')

    def getPrdImg(self, list_data):
        """Return the ``src`` of the product image."""
        return list_data.select('div.img_box a img')[0].get('src')

    def firstTag(self, list_data):
        """Return the labels overlaid on the product image, comma-joined."""
        spans = list_data.select('div.img_box a span')
        return ",".join(span.getText().strip() for span in spans)

    def secondTag(self, list_data):
        """Return the labels in the second info box, comma-joined."""
        spans = list_data.select('div.info_box p.info_box02 span')
        return ",".join(span.getText().strip() for span in spans)

    def stockChk(self, list_data):
        """Return the out-of-stock text, or "판매중" when no such badge exists."""
        badge = list_data.select('div.outOfStock3 span.txt')
        if not badge:
            return self._ON_SALE
        return badge[0].getText().strip()

    def detailedTxt(self, list_data):
        """Return the product info spans as a dict.

        The first span is stored as ``brandName`` and the second as
        ``modelnum``. Every later span is read as ``"key: value"`` and split
        at the first colon only, so values may themselves contain colons; a
        span with no colon maps its text to ``""``.
        """
        details = {}
        spans = list_data.select('div.info_box p.prd_info span')
        for position, span in enumerate(spans):
            text = span.getText().strip()
            if position == 0:
                details['brandName'] = text
            elif position == 1:
                details['modelnum'] = text
            else:
                key, _, value = text.partition(":")
                details[key.strip()] = value.strip()
        return details

    # ---- aggregated outputs -------------------------------------------

    def getfullResult(self):
        """Return one dict per product on the page.

        When the page reports zero results, a single placeholder row with
        blank product fields is returned instead.
        """
        if self.resultCount == '0':
            row = {'term': self.keyword, 'encodedTerm': self.urlterm, 'rc': "0"}
            for key in self._BLANK_ROW_KEYS:
                row[key] = ""
            return [row]

        rows = []
        for rank, item in enumerate(self.prdList, start=1):
            row = {
                'term': self.keyword,
                'encodedTerm': self.urlterm,
                'rc': self.resultCount,
                'rank': str(rank),
                'prdCode': self.getPrdCode(item),
                'prdName': self.getPrdName(item),
                'promoMsg': self.getpromoMsg(item),
                'price': self.getPrdPrice(item),
                'coupon': self.getPrdCouponRate(item),
                'moq': self.getPrdMoq(item),
                'imgTags': self.firstTag(item),
                'tagData': self.secondTag(item),
                'imgURL': self.getPrdImg(item),
                'outofStock': self.stockChk(item),
            }
            row.update(self.detailedTxt(item))
            rows.append(row)
        return rows

    def getSummaryResult(self):
        """Return a one-row summary of the page.

        The counters are the integer 0 when the page reports zero results;
        otherwise ``rc`` is the hit-count string and the remaining values
        come from the statistics computed in the constructor.
        """
        empty = self.resultCount == '0'
        resdic = {
            'term': self.keyword,
            'rc': 0 if empty else self.resultCount,
            'validrc': 0 if empty else self.validRc[0],
            'outofStockCount': 0 if empty else self.validRc[1],
            'priceAvg': 0 if empty else self.AVGprice[0],
        }
        resdic.update(self.top3Cate)
        resdic['targetUrl'] = self.GetTargetUrl()
        return resdic