#!/Users/maddiekorea/Workspace/bin/python
#version 201904
"""Fetch the first page of iMarket search results for a keyword and print TSV.

Usage: pass the search keyword as the first command-line argument.

One tab-separated row is written to stdout per product, with the columns:
term, URL-quoted term, result count, rank, product code, product name,
promo message, price, coupon, minimum order quantity, image tags, tags,
image URL, stock status, and the product info spans (themselves
tab-separated).  When the result count is 0 a single
``term<TAB>urlterm<TAB>NoResult`` row is printed instead.  The elapsed time
is reported through ``logging.warning`` at the end.
"""
import logging
import math
import re
import sys
import urllib
from datetime import datetime
from urllib import parse

import bs4
import requests

startTime = datetime.now()
url = "https://www.imarket.co.kr/display/malls.do"


def query(keyword):
    """Return *keyword* encoded as EUC-KR bytes."""
    return keyword.encode('euc-kr')


def parameters(page, query):
    """Build the query-string parameters for one search request.

    page  -- value sent as ``sc.page``.
    query -- value sent as ``sc.queryText`` (the caller passes the EUC-KR
             encoded keyword).
    """
    return {
        '_method': 'searchGoods',
        'sc.page': page,
        'sc.row': '200',
        'sc.viewType': 'list',
        'sc.queryText': query,
    }


def headers():
    """Return the fixed HTTP request headers sent with every search."""
    return {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.imarket.co.kr',
        'Pragma': 'no-cache',
        'Referer': 'http://www.imarket.co.kr/',
        'Save-Data': 'on',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }


def pageRequest(url, parameters, headers, timeout=30):
    """GET *url* and return the response body decoded as EUC-KR text.

    url        -- address to request.
    parameters -- mapping passed to requests as the query string.
    headers    -- mapping of HTTP request headers.
    timeout    -- seconds handed to ``requests.get``; without it a stalled
                  server would block the script forever.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) on a 4xx/5xx
    response.
    """
    resp = requests.get(url, params=parameters, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # The response is decoded as EUC-KR regardless of what it declares.
    resp.encoding = 'EUC-KR'
    return resp.text


def _first_text(node, selector):
    """Return the stripped text of the first match of *selector*, or ''."""
    found = node.select(selector)
    return found[0].getText().strip() if found else ''


def _first_attr(node, selector, attr):
    """Return attribute *attr* of the first match of *selector*, or ''."""
    found = node.select(selector)
    if not found:
        return ''
    # .get() yields None when the attribute is absent; keep the row joinable.
    return found[0].get(attr) or ''


def _joined_text(node, selector, sep):
    """Join the stripped text of every match of *selector* with *sep*."""
    return sep.join(el.getText().strip() for el in node.select(selector))


def main(argv=None):
    """Run one search for the keyword in ``argv[1]`` and print the rows.

    argv -- argument vector; defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: python <script> <keyword>")

    term = str(argv[1])
    encoded = query(term)
    urlterm = parse.quote(encoded)

    htmlHead = pageRequest(url, parameters(1, encoded), headers())
    bs = bs4.BeautifulSoup(htmlHead, 'html.parser')

    # Total hit count shown in the result header; thousands separators removed.
    # Left as a hard lookup: a page without it is not a usable result page.
    rc = bs.select('div.tit_category_wrap h2.tit_result span em')[0].getText().strip()
    rc = rc.replace(",", "")
    if rc == '0':
        print(term + "\t" + urlterm + "\t" + "NoResult")

    for rank, item in enumerate(bs.select('ul.prd_list_type li'), start=1):
        if not item.select('div.info_box a.tit'):
            # No product title in this <li>: report it and move on instead of
            # crashing on the missing element.
            logging.error(term + " : " + str(rank))
            continue

        # Strip the "상품코드 : " ("product code : ") label from the code.
        prdCode = re.sub(r"^상품코드\ \:\ ", "",
                         _first_text(item, 'div.info_box span.prd_code'))
        prdName = _first_text(item, 'div.info_box a.tit')
        promoMsg = _first_text(item, 'div.info_box p.prd_promo')
        price = _first_text(item, 'div.price_box span.sale_price em.num').replace(",", "")
        coupon = _first_text(item, 'div.price_box span.discount em.num')
        moq = _first_attr(item, 'div.amount_box span.btn_wrap label input.pr-number', 'value')
        imgURL = _first_attr(item, 'div.img_box a img', 'src')
        imgTags = _joined_text(item, 'div.img_box a span', ",")
        tagData = _joined_text(item, 'div.info_box p.info_box02 span', ",")

        # A button labelled "장바구니" (shopping cart) is reported as
        # "판매중" (on sale); any other label is passed through unchanged.
        outofStock = _first_text(item, 'div.btns a')
        if outofStock == "장바구니":
            outofStock = "판매중"

        txtdata = _joined_text(item, 'div.info_box p.prd_info span', "\t")

        print("\t".join([
            term, urlterm, str(rc), str(rank), prdCode, prdName, promoMsg,
            price, coupon, moq, imgTags, tagData, imgURL, outofStock, txtdata,
        ]))

    consumtime = datetime.now() - startTime
    logging.warning(term + "\t" + str(consumtime))


if __name__ == "__main__":
    main()