364 lines
14 KiB
Python
364 lines
14 KiB
Python
#!/usr/bin/env python
|
|
from typing import Dict, Any, Union
|
|
from datetime import datetime
|
|
import requests, bs4, re, logging
|
|
from urllib import parse
|
|
|
|
class imarketGet:
    """Fetch one imarket.co.kr listing page and expose parsed results.

    Constructing an instance performs the HTTP request immediately and
    pre-computes the product list, result counts, average price and the
    top-3 categories.  ``getfullResult`` and ``getSummaryResult`` turn that
    state into plain dict/list structures.

    Supported ``mode`` values and the page size requested for each are
    listed in ``_ROWS_BY_MODE``.
    """

    # Rows requested per page ('sc.row') for every supported mode.
    _ROWS_BY_MODE = {"search": 100, "category": 1000, "SEM": 20}

    # Per-product keys that are emitted empty when a query has no results.
    _EMPTY_ROW_FIELDS = (
        'rank', 'prdCode', 'prdName', 'promoMsg', 'price', 'coupon', 'moq',
        'imgTags', 'tagData', 'imgURL', 'outofStock', 'brandName', 'modelnum',
    )

    def __init__(self, keyword, mode, attrDict):
        """Fetch and parse the listing page for *keyword*.

        Args:
            keyword: Search text ("search"/"SEM" modes) or the value sent as
                'sc.dispNo' ("category" mode).  Must be encodable as EUC-KR.
            mode: One of "search", "category" or "SEM".
            attrDict: Optional overrides for the tracking parameters used by
                ``GetTargetUrl`` (keys 'BIZ_CD', 'utm_source', 'utm_medium',
                'utm_campaign', 'utm_content', 'utm_keyword').

        Raises:
            ValueError: if *mode* is not a supported mode.
        """
        self.startTime = datetime.now()
        self.keyword = keyword
        self.url = "https://www.imarket.co.kr/display/malls.do"
        self.mode = mode
        # Fail fast (before any network traffic) instead of leaving
        # ``self.row`` unset and crashing later with an AttributeError.
        if mode not in self._ROWS_BY_MODE:
            raise ValueError(
                "unsupported mode %r; expected one of %s"
                % (mode, sorted(self._ROWS_BY_MODE))
            )
        self.row = self._ROWS_BY_MODE[mode]
        # NOTE: the two assignments below intentionally replace the bound
        # methods with their results on this instance; afterwards
        # ``self.query`` is bytes and ``self.urlterm`` is a str.
        self.query = self.query()
        self.urlterm = self.urlterm()
        self.bs = self.getHTML()
        self.prdList = self.getPrdList()
        self.resultCount = self.getResultCount()
        self.validRc = self.getValidResultCount()
        self.AVGprice = self.getAVGprice()
        self.top3Cate = self.categoryRefine(3)
        self.attr = attrDict

    # Timing hook, currently disabled:
    # def __del__(self):
    #     consumtime = datetime.now() - self.startTime
    #     logging.warning(self.keyword + "_imarket : " + str(consumtime))

    def query(self):
        """Return the keyword encoded as EUC-KR bytes."""
        return self.keyword.encode('euc-kr')

    def urlterm(self):
        """Return the percent-encoded form of ``self.query``.

        Expects ``self.query`` to already hold the encoded bytes, which is
        the case once ``__init__`` has run its first assignment.
        """
        return parse.quote(self.query)

    def urlParameters(self):
        """Build the query-string parameters for the listing request."""
        data: Dict[str, Any] = {
            'sc.page': '1',
            'sc.row': self.row,
            'sc.viewType': 'list',
        }
        if self.mode == "search" or self.mode == "SEM":
            data['_method'] = 'searchGoods'
            data['sc.queryText'] = self.query
        if self.mode == "category":
            data['_method'] = '1Depth'
            data['cateDepth'] = '4'
            data['sc.shopNo'] = '0000100000'
            data['sc.dispNo'] = self.query
        return data

    def headers(self):
        """Return the fixed browser-like request headers sent with the page fetch."""
        return {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.imarket.co.kr',
            'Pragma': 'no-cache',
            'Referer': 'https://www.imarket.co.kr/',
            'Save-Data': 'on',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        }

    def getHTML(self):
        """GET the listing page and return it parsed with BeautifulSoup.

        The response body is decoded as EUC-KR.  HTTP error statuses are
        raised by ``raise_for_status``.
        """
        resp = requests.get(
            self.url, params=self.urlParameters(), headers=self.headers()
        )
        resp.raise_for_status()
        resp.encoding = 'EUC-KR'
        return bs4.BeautifulSoup(resp.text, 'html.parser')

    def GetTargetUrl(self):
        """Return the listing URL with tracking parameters appended.

        Values in ``self.attr`` override the built-in defaults; a ``None``
        ``attr`` is treated as "no overrides".
        """
        paramz = self.urlParameters()
        paras = self.attr or {}
        if 'utm_content' in paras:
            utm_content = paras['utm_content']
        else:
            utm_content = self.top3Cate['top1CateName']
        paramz.update({
            'BIZ_CD': paras.get('BIZ_CD', '1010187'),
            'utm_source': paras.get('utm_source', 'naverPowerlink'),
            'utm_medium': paras.get('utm_medium', 'prdcpc'),
            'utm_campaign': paras.get('utm_campaign', 'test'),
            'utm_content': utm_content,
            # The 'utm_keyword' override is sent as 'utm_term', which is
            # reportedly tracked more reliably (newer spec).
            'utm_term': paras.get('utm_keyword', ''),
        })
        # NOTE(review): this performs a full GET only to read the final URL
        # (after any redirects).  If redirects are irrelevant,
        # requests.Request(...).prepare().url would avoid the round trip.
        resp = requests.get(self.url, params=paramz)
        return resp.url

    def getResultCount(self):
        """Return the total result count as a digit string (commas removed).

        Returns '0' when the counter element is absent from the page.
        """
        if self.mode == "category":
            selector = 'div.prd_list_wrap div.sort_wrap span.total em'
        else:
            # "search" and "SEM" both land on the search-result page.
            selector = 'div.tit_category_wrap h2.tit_result span em'
        found = self.bs.select(selector)
        # The search page omits this element when there are no results; the
        # category page is handled the same way for consistency.
        rc = found[0].getText().strip() if found else str(0)
        # Drop thousands separators.
        return rc.replace(",", "")

    def getPrdList(self):
        """Return every product ``<li>`` element found on the page."""
        return self.bs.select('ul.prd_list_type li')

    def getValidResultCount(self):
        """Return ``[valid_count, sold_out_count]``.

        Sold-out items are only subtracted when the whole result set fits on
        the fetched page (``resultCount <= row``); otherwise the total is
        reported unchanged.
        """
        total = int(self.resultCount)
        sold_out = sum(
            1 for item in self.prdList if self.stockChk(item) == "일시품절"
        )
        valid = total - sold_out if total <= self.row else total
        return [valid, sold_out]

    def getAVGprice(self):
        """Return ``[average_order_value, members_only_price_count]``.

        Each item contributes ``price * moq``.  Items whose price text is
        "회원특별가" or that are sold out contribute 0.
        """
        # NOTE(review): zero-valued rows are included in the denominator, so
        # they pull the average down — confirm this is intended.
        amounts = []
        hidden_price_count = 0
        for item in self.prdList:
            price_text = self.getPrdPrice(item)
            if price_text == "회원특별가":
                hidden_price_count += 1
                amounts.append(0)
            elif self.stockChk(item) == "일시품절":
                amounts.append(0)
            else:
                amounts.append(int(price_text) * int(self.getPrdMoq(item)))
        average = round(sum(amounts) / len(amounts)) if amounts else 0
        return [average, hidden_price_count]

    def categoryRefine(self, cutoff):
        """Return the *cutoff* largest categories from the page's filter box.

        The result always contains exactly ``cutoff`` pairs of keys
        ``'top<N>CateName'`` / ``'top<N>CateCount'`` (N starting at 1),
        ordered by descending count.  Missing ranks are filled with ``""``.
        """
        cutoff = int(cutoff)
        counts = {}
        for link in self.bs.select('div.filter_wrap ul li dl.category dd ul li a'):
            label = link.getText().strip()
            raw_count = link.select('em')[0].getText().strip()
            # Strip the "(123)" suffix; commas are accepted so that counts
            # with thousands separators are handled as well.
            label = re.sub(r"\([0-9,]+\)", "", label).strip()
            counts[label] = int(re.sub(r"\D", "", raw_count))
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:cutoff]
        # Pad from the number of *distinct* categories actually ranked, so
        # duplicate names on the page can no longer leave keys missing.
        ranked += [("", "")] * (cutoff - len(ranked))
        resAr = {}
        for rank, (name, count) in enumerate(ranked, start=1):
            resAr['top' + str(rank) + 'CateName'] = name
            resAr['top' + str(rank) + 'CateCount'] = count
        return resAr

    # ------------------------------------------------------------------
    # Per-product field extractors (each takes one product <li> element)
    # ------------------------------------------------------------------

    def getPrdCode(self, list_data):
        """Return the product code with its "상품코드 : " prefix removed."""
        prdCode = list_data.select('div.info_box span.prd_code')[0].getText().strip()
        return re.sub(r"^상품코드\ \:\ ", "", prdCode)

    def getPrdName(self, list_data):
        """Return the product title, or a marker string when it is missing."""
        titles = list_data.select('div.info_box a.tit')
        if len(titles) < 1:
            return "error!!!!!!!!!!!!!!!!"
        return titles[0].getText().strip()

    def getpromoMsg(self, list_data):
        """Return the promotion text, or "" when the item has none."""
        promo = list_data.select('div.info_box p.prd_promo')
        return promo[0].getText().strip() if promo else ""

    def getPrdPrice(self, list_data):
        """Return the sale-price text without the trailing "원" and commas.

        Non-numeric price labels are returned unchanged; ``getAVGprice``
        treats the "회원특별가" label as a hidden price.
        """
        price = list_data.select('div.price_box span.sale_price')[0].getText().strip()
        price = re.sub(r"원$", "", price)
        return price.replace(",", "")

    def getPrdCouponRate(self, list_data):
        """Return the discount figure text, or '' when no discount is shown."""
        couponArr = list_data.select('div.price_box span.discount em.num')
        return couponArr[0].getText().strip() if couponArr else ''

    def getPrdMoq(self, list_data):
        """Return the ``value`` attribute of the quantity input.

        NOTE(review): used as the minimum order quantity — presumably the
        input is pre-filled with it; verify against the page markup.
        """
        return list_data.select(
            'div.amount_box span.btn_wrap label input.pr-number'
        )[0].get('value')

    def getPrdImg(self, list_data):
        """Return the ``src`` of the product thumbnail image."""
        return list_data.select('div.img_box a img')[0].get('src')

    def firstTag(self, list_data):
        """Return the badge texts in the image box, comma-joined."""
        return ",".join(
            tag.getText().strip()
            for tag in list_data.select('div.img_box a span')
        )

    def secondTag(self, list_data):
        """Return the tag texts in the info box, comma-joined."""
        return ",".join(
            tag.getText().strip()
            for tag in list_data.select('div.info_box p.info_box02 span')
        )

    def stockChk(self, list_data):
        """Return the sold-out label text, or "판매중" when none is shown."""
        outofStock = list_data.select('div.outOfStock3 span.txt')
        if not outofStock:
            return "판매중"
        return outofStock[0].getText().strip()

    def detailedTxt(self, list_data):
        """Return the product-info spans as a dict.

        The first span is stored as 'brandName' and the second as
        'modelnum'.  Every further span is split at its first ':' into
        key and value; a span without ':' is stored with an empty value.
        """
        txtdic = {}
        spans = list_data.select('div.info_box p.prd_info span')
        for position, span in enumerate(spans):
            text = span.getText().strip()
            if position == 0:
                txtdic['brandName'] = text
            elif position == 1:
                txtdic['modelnum'] = text
            else:
                # partition keeps colons inside the value ("ratio: 10:1")
                # and cannot raise on a span that has no colon at all.
                key, _, value = text.partition(":")
                txtdic[key.strip()] = value.strip()
        return txtdic

    # ------------------------------------------------------------------
    # Result builders
    # ------------------------------------------------------------------

    def getfullResult(self):
        """Return one dict per listed product.

        When the result count is '0' a single placeholder row with empty
        product fields is returned instead.
        """
        if self.resultCount == '0':
            resdic = {
                'term': self.keyword,
                'encodedTerm': self.urlterm,
                'rc': "0",
            }
            resdic.update((field, "") for field in self._EMPTY_ROW_FIELDS)
            return [resdic]

        res = []
        for rank, item in enumerate(self.prdList, start=1):
            resdic = {
                'term': self.keyword,
                'encodedTerm': self.urlterm,
                'rc': self.resultCount,
                'rank': str(rank),
                'prdCode': self.getPrdCode(item),
                'prdName': self.getPrdName(item),
                'promoMsg': self.getpromoMsg(item),
                'price': self.getPrdPrice(item),
                'coupon': self.getPrdCouponRate(item),
                'moq': self.getPrdMoq(item),
                'imgTags': self.firstTag(item),
                'tagData': self.secondTag(item),
                'imgURL': self.getPrdImg(item),
                'outofStock': self.stockChk(item),
            }
            resdic.update(self.detailedTxt(item))
            res.append(resdic)
        return res

    def getSummaryResult(self):
        """Return a one-row summary dict for the keyword.

        Contains the counts, average price, top-3 categories and the
        tracking URL.  For an empty result the numeric fields are the
        integer 0; otherwise 'rc' is the result-count string.
        """
        if self.resultCount == '0':
            resdic = {
                'term': self.keyword,
                'rc': 0,
                'validrc': 0,
                'outofStockCount': 0,
                'priceAvg': 0,
            }
        else:
            resdic = {
                'term': self.keyword,
                'rc': self.resultCount,
                'validrc': self.validRc[0],
                'outofStockCount': self.validRc[1],
                'priceAvg': self.AVGprice[0],
            }
        resdic.update(self.top3Cate)
        resdic['targetUrl'] = self.GetTargetUrl()
        return resdic