#!/usr/bin/env python
"""Scrape search / category result pages of imarket.co.kr.

The module exposes a single class, ``imarketGet``, which downloads one result
page for a keyword (or category number) and derives per-product rows and
summary statistics from the returned HTML.
"""
import logging
import re
from datetime import datetime
from typing import Any, Dict, List, Union
from urllib import parse

import bs4
import requests


class imarketGet:
    """Fetch one imarket result page and expose parsed data about it.

    Constructing an instance performs the HTTP request immediately and caches
    the parsed page plus the derived statistics as attributes:

    * ``bs``          -- parsed page (``bs4.BeautifulSoup``)
    * ``prdList``     -- list of product ``<li>`` nodes
    * ``resultCount`` -- total number of results, as a digit string
    * ``validRc``     -- ``[valid_count, sold_out_count]``
    * ``AVGprice``    -- ``[average_price, member_only_price_count]``
    * ``top3Cate``    -- dict with ``top1..top3`` category names and counts
    """

    # Number of rows requested per page for each supported mode.
    _ROWS_BY_MODE = {"search": 100, "category": 1000, "SEM": 20}

    # Marker texts scraped from the page (compared verbatim).
    _SOLD_OUT = "일시품절"
    _MEMBER_ONLY_PRICE = "회원특별가"

    # Seconds to wait for the site before giving up on a request.
    DEFAULT_TIMEOUT = 30

    def __init__(self, keyword, mode, attrDict, timeout=DEFAULT_TIMEOUT):
        """Download and analyse the result page for ``keyword``.

        Args:
            keyword: Search term, or the category display number when
                ``mode`` is ``"category"``.
            mode: One of ``"search"``, ``"category"`` or ``"SEM"``.
            attrDict: Optional overrides for the tracking parameters used by
                ``GetTargetUrl`` (``BIZ_CD``, ``utm_source``, ...). ``None``
                is treated as an empty dict.
            timeout: Seconds to wait for the HTTP response.

        Raises:
            ValueError: If ``mode`` is not a supported mode.
            requests.RequestException: If the page cannot be fetched.
        """
        if mode not in self._ROWS_BY_MODE:
            raise ValueError(
                "unsupported mode %r; expected one of %s"
                % (mode, sorted(self._ROWS_BY_MODE))
            )
        self.startTime = datetime.now()
        self.keyword = keyword
        self.url = "https://www.imarket.co.kr/display/malls.do"
        self.mode = mode
        self.row = self._ROWS_BY_MODE[mode]
        self.timeout = timeout
        self.attr = attrDict if attrDict is not None else {}
        # NOTE: the two assignments below intentionally replace the bound
        # methods with their results; existing callers read ``obj.query`` and
        # ``obj.urlterm`` as plain attributes.
        self.query = self.query()
        self.urlterm = self.urlterm()
        self.bs = self.getHTML()
        self.prdList = self.getPrdList()
        self.resultCount = self.getResultCount()
        self.validRc = self.getValidResultCount()
        self.AVGprice = self.getAVGprice()
        self.top3Cate = self.categoryRefine(3)

    def _query_bytes(self):
        """Return the EUC-KR encoded keyword, cached by ``__init__`` or not."""
        current = self.query
        return current() if callable(current) else current

    def query(self):
        """Return the keyword encoded as EUC-KR bytes."""
        return self.keyword.encode('euc-kr')

    def urlterm(self):
        """Return the EUC-KR encoded keyword, percent-encoded for a URL."""
        return parse.quote(self._query_bytes())

    def urlParameters(self):
        """Build the query-string parameters for the current mode.

        Returns:
            A new dict on every call, so callers may mutate it freely.
        """
        data: Dict[str, Union[Union[str, int], Any]] = {
            'sc.page': '1',
            'sc.row': self.row,
            'sc.viewType': 'list',
        }
        if self.mode in ("search", "SEM"):
            data['_method'] = 'searchGoods'
            data['sc.queryText'] = self._query_bytes()
        elif self.mode == "category":
            data['_method'] = '1Depth'
            data['cateDepth'] = '4'
            data['sc.shopNo'] = '0000100000'
            data['sc.dispNo'] = self._query_bytes()
        return data

    def headers(self):
        """Return the browser-like HTTP headers sent with the page request."""
        return {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'www.imarket.co.kr',
            'Pragma': 'no-cache',
            'Referer': 'https://www.imarket.co.kr/',
            'Save-Data': 'on',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        }

    def getHTML(self):
        """Fetch the result page and return it parsed with BeautifulSoup.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within ``self.timeout``.
        """
        resp = requests.get(
            self.url,
            params=self.urlParameters(),
            headers=self.headers(),
            timeout=self.timeout,
        )
        resp.raise_for_status()
        # The response is decoded as EUC-KR regardless of what the server
        # declares.
        resp.encoding = 'EUC-KR'
        return bs4.BeautifulSoup(resp.text, 'html.parser')

    def GetTargetUrl(self):
        """Return the page URL with the tracking (``utm_*``) parameters added.

        Values found in ``self.attr`` override the built-in defaults. The URL
        is assembled locally; no HTTP request is made.
        """
        paramz = self.urlParameters()
        paras = self.attr or {}
        if 'utm_content' in paras:
            utm_content = paras['utm_content']
        else:
            # Looked up lazily so an explicit override never needs top3Cate.
            utm_content = self.top3Cate['top1CateName']
        paramz.update({
            'BIZ_CD': paras.get('BIZ_CD', '1010187'),
            'utm_source': paras.get('utm_source', 'naverPowerlink'),
            'utm_medium': paras.get('utm_medium', 'prdcpc'),
            'utm_campaign': paras.get('utm_campaign', 'test'),
            'utm_content': utm_content,
            # The keyword is sent as ``utm_term``; per the original author's
            # note this newer parameter name is tracked more reliably.
            'utm_term': paras.get('utm_keyword', ''),
        })
        # Preparing the request encodes the parameters exactly as requests
        # would when sending it, without touching the network.
        return requests.Request('GET', self.url, params=paramz).prepare().url

    def getResultCount(self):
        """Return the total number of results as a string of digits.

        Returns ``"0"`` when the page has no result counter (the element is
        absent when a search yields nothing).

        Raises:
            ValueError: If ``self.mode`` is not a supported mode.
        """
        if self.mode in ("search", "SEM"):
            selector = 'div.tit_category_wrap h2.tit_result span em'
        elif self.mode == "category":
            selector = 'div.prd_list_wrap div.sort_wrap span.total em'
        else:
            raise ValueError("unsupported mode %r" % (self.mode,))
        found = self.bs.select(selector)
        rc = found[0].getText().strip() if found else "0"
        # Drop thousands separators so the value can be passed to int().
        return rc.replace(",", "") or "0"

    def getPrdList(self):
        """Return every product ``<li>`` node of the result list."""
        return self.bs.select('ul.prd_list_type li')

    def getValidResultCount(self):
        """Return ``[valid_count, sold_out_count]``.

        Sold-out items are only subtracted when the whole result set fits on
        the fetched page; otherwise the sold-out count covers just this page
        and the total is reported unchanged.
        """
        total = int(self.resultCount)
        sold_out = sum(
            1 for item in self.prdList if self.stockChk(item) == self._SOLD_OUT
        )
        valid = total - sold_out if total <= self.row else total
        return [valid, sold_out]

    def getAVGprice(self):
        """Return ``[average_price, member_only_price_count]`` for the page.

        Each product contributes ``price * minimum order quantity``. Sold-out
        items and items showing only a member-special price contribute 0 but
        still count towards the divisor.
        """
        amounts = []  # type: List[int]
        noOpenPriceCount = 0
        for item in self.prdList:
            price = self.getPrdPrice(item)
            if price == self._MEMBER_ONLY_PRICE:
                noOpenPriceCount += 1
                amounts.append(0)
            elif self.stockChk(item) == self._SOLD_OUT:
                amounts.append(0)
            else:
                amounts.append(int(price) * int(self.getPrdMoq(item)))
        priceAvg = round(sum(amounts) / len(amounts)) if amounts else 0
        return [priceAvg, noOpenPriceCount]

    def categoryRefine(self, cutoff):
        """Return the ``cutoff`` largest categories of the filter sidebar.

        Args:
            cutoff: Number of categories to report.

        Returns:
            A dict with keys ``top1CateName``, ``top1CateCount``, ... up to
            ``top<cutoff>``. Ranks for which no category exists hold ``""``
            for both name and count.
        """
        cutoff = int(cutoff)
        counts = {}
        for link in self.bs.select('div.filter_wrap ul li dl.category dd ul li a'):
            label = link.getText().strip()
            count = link.select('em')[0].getText().strip()
            # The link text embeds the count, e.g. "Name (1,234)"; strip it
            # from the name and keep only the digits of the count.
            label = re.sub(r"\([0-9,]+\)", "", label).strip()
            counts[label] = int(re.sub(r"[(),]", "", count))
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

        resAr = {}
        for rank in range(1, cutoff + 1):
            # Pad from the number of *distinct* categories so the keys stay
            # contiguous even when the page lists a name twice.
            name, count = ranked[rank - 1] if rank <= len(ranked) else ("", "")
            resAr['top' + str(rank) + 'CateName'] = name
            resAr['top' + str(rank) + 'CateCount'] = count
        return resAr

    # ------------------------------------------------------------------
    # Per-product field extractors. ``list_data`` is one node of prdList.
    # ------------------------------------------------------------------

    def getPrdCode(self, list_data):
        """Return the product code with its leading label removed.

        Raises:
            IndexError: If the product node has no code element.
        """
        prdCode = list_data.select('div.info_box span.prd_code')[0].getText().strip()
        return re.sub(r"^상품코드\ \:\ ", "", prdCode)

    def getPrdName(self, list_data):
        """Return the product title, or an error marker if it is missing."""
        titles = list_data.select('div.info_box a.tit')
        if not titles:
            return "error!!!!!!!!!!!!!!!!"
        return titles[0].getText().strip()

    def getpromoMsg(self, list_data):
        """Return the promotional message, or ``""`` when there is none."""
        promo = list_data.select('div.info_box p.prd_promo')
        return promo[0].getText().strip() if promo else ""

    def getPrdPrice(self, list_data):
        """Return the sale price text without currency suffix and commas.

        The result is not guaranteed to be numeric: it may be the
        member-special-price marker instead of a number.

        Raises:
            IndexError: If the product node has no price element.
        """
        price = list_data.select('div.price_box span.sale_price')[0].getText().strip()
        price = re.sub(r"원$", "", price)
        return re.sub(r"\,", "", price)

    def getPrdCouponRate(self, list_data):
        """Return the discount figure text, or ``''`` when none is shown."""
        couponArr = list_data.select('div.price_box span.discount em.num')
        return couponArr[0].getText().strip() if couponArr else ''

    def getPrdMoq(self, list_data):
        """Return the ``value`` of the quantity input (minimum order qty).

        Raises:
            IndexError: If the product node has no quantity input.
        """
        return list_data.select(
            'div.amount_box span.btn_wrap label input.pr-number'
        )[0].get('value')

    def getPrdImg(self, list_data):
        """Return the ``src`` of the product thumbnail.

        Raises:
            IndexError: If the product node has no image.
        """
        return list_data.select('div.img_box a img')[0].get('src')

    def firstTag(self, list_data):
        """Return the badge texts over the thumbnail, comma-separated."""
        return ",".join(
            span.getText().strip()
            for span in list_data.select('div.img_box a span')
        )

    def secondTag(self, list_data):
        """Return the tag texts of the info box, comma-separated."""
        return ",".join(
            span.getText().strip()
            for span in list_data.select('div.info_box p.info_box02 span')
        )

    def stockChk(self, list_data):
        """Return the out-of-stock label, or ``"판매중"`` when none is shown."""
        outofStock = list_data.select('div.outOfStock3 span.txt')
        if not outofStock:
            return "판매중"
        return outofStock[0].getText().strip()

    def detailedTxt(self, list_data):
        """Return the product's detail spans as a dict.

        The first span is stored as ``brandName``, the second as
        ``modelnum``. Every further span is expected to read ``key: value``
        and is split at the first colon, so values may themselves contain
        colons. Spans without a colon are skipped.
        """
        txtdic = {}
        spans = list_data.select('div.info_box p.prd_info span')
        for position, span in enumerate(spans):
            text = span.getText().strip()
            if position == 0:
                txtdic['brandName'] = text
            elif position == 1:
                txtdic['modelnum'] = text
            else:
                key, sep, value = text.partition(":")
                if not sep:
                    continue
                txtdic[key.strip()] = value.strip()
        return txtdic

    # ------------------------------------------------------------------
    # Result builders.
    # ------------------------------------------------------------------

    def getfullResult(self):
        """Return one dict per product on the page.

        When the search produced no results a single placeholder row is
        returned, with ``rc`` set to ``"0"`` and every product field empty.
        """
        if self.resultCount == '0':
            placeholder = {
                'term': self.keyword,
                'encodedTerm': self.urlterm,
                'rc': "0",
            }
            for field in ('rank', 'prdCode', 'prdName', 'promoMsg', 'price',
                          'coupon', 'moq', 'imgTags', 'tagData', 'imgURL',
                          'outofStock', 'brandName', 'modelnum'):
                placeholder[field] = ""
            return [placeholder]

        res = []
        for rank, item in enumerate(self.prdList, start=1):
            resdic = {
                'term': self.keyword,
                'encodedTerm': self.urlterm,
                'rc': self.resultCount,
                'rank': str(rank),
                'prdCode': self.getPrdCode(item),
                'prdName': self.getPrdName(item),
                'promoMsg': self.getpromoMsg(item),
                'price': self.getPrdPrice(item),
                'coupon': self.getPrdCouponRate(item),
                'moq': self.getPrdMoq(item),
                'imgTags': self.firstTag(item),
                'tagData': self.secondTag(item),
                'imgURL': self.getPrdImg(item),
                'outofStock': self.stockChk(item),
            }
            resdic.update(self.detailedTxt(item))
            res.append(resdic)
        return res

    def getSummaryResult(self):
        """Return the page-level summary as a single dict.

        Keys: ``term``, ``rc``, ``validrc``, ``outofStockCount``,
        ``priceAvg``, the ``top1..top3`` category entries and ``targetUrl``.
        The member-only-price count (``AVGprice[1]``) is not exported.
        """
        resdic = {'term': self.keyword}
        if self.resultCount == '0':
            resdic['rc'] = 0
            resdic['validrc'] = 0
            resdic['outofStockCount'] = 0
            resdic['priceAvg'] = 0
        else:
            resdic['rc'] = self.resultCount
            resdic['validrc'] = self.validRc[0]
            resdic['outofStockCount'] = self.validRc[1]
            resdic['priceAvg'] = self.AVGprice[0]
        resdic.update(self.top3Cate)
        resdic['targetUrl'] = self.GetTargetUrl()
        return resdic