#version 201904
"""Crawl imarket.co.kr search results for a keyword, cross-check the keyword
against Naver integrated search and Naver powerlink ads, and record the
collected statistics in the `crawler` MySQL database.

Usage:
    script.py TERM [PACKED|-] [UTM_CAMPAIGN] [UTM_CONTENT]

TERM          original search term
PACKED        space-free replacement term for the Naver lookups; '-' or ''
              falls back to TERM with spaces removed
UTM_CAMPAIGN  utm_campaign value for the generated landing URL
UTM_CONTENT   utm_content value (defaults to the top category name)
"""
import logging
import re
import sys
import urllib
from datetime import datetime
from urllib import parse

# NOTE(review): third-party modules (requests, bs4, pymysql) are imported
# lazily inside the functions that need them, so importing this module has
# no side effects and does not require them to be installed.

url = "http://www.imarket.co.kr/display/malls.do"


def query(keyword):
    """Return *keyword* encoded as EUC-KR bytes (imarket expects EUC-KR)."""
    return keyword.encode('euc-kr')


def parameters(page, query):
    """Build the imarket `searchGoods` request parameters for *page*/*query*."""
    return {
        '_method': 'searchGoods',
        'sc.page': page,
        'sc.row': '20',
        'sc.viewType': 'list',
        'sc.queryText': query,
    }


def headers():
    """Static browser-like request headers for imarket.co.kr."""
    return {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.imarket.co.kr',
        'Pragma': 'no-cache',
        'Referer': 'http://www.imarket.co.kr/',
        'Save-Data': 'on',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }


def pageRequest(url, parameters, headers):
    """GET *url* with the given params/headers; return the EUC-KR-decoded body.

    Raises requests.HTTPError on a non-2xx response.
    """
    import requests  # third-party; deferred (see module NOTE)
    resp = requests.get(url, params=parameters, headers=headers)
    resp.raise_for_status()
    resp.encoding = 'EUC-KR'
    return resp.text


def _strip_spaces(text):
    """Remove every literal space from *text*."""
    return re.sub(r"\ ", "", text)


def _parse_args(argv):
    """Parse argv -> (term, packed, utm_campaign, utm_content, sp).

    Mirrors the original positional handling:
      argv[1] original term; argv[2] replacement ('-'/'' = none);
      argv[3] utm campaign (only read when len(argv) is 4 or 5);
      argv[4] utm content (only read when len(argv) is 5).
    `sp` marks whether packed differs from term only by spaces ("sp")
    or is a genuinely different keyword ("df"); '' when identical.
    """
    utm_campaign = 'test'
    utm_content = ''
    sp = ''
    term = str(argv[1])
    if len(argv) == 2 or argv[2] in ('-', ''):
        packed = _strip_spaces(term)
    else:
        packed = _strip_spaces(argv[2])
    if len(argv) in (4, 5) and argv[3]:
        utm_campaign = str(argv[3])
    if len(argv) == 5 and argv[4]:
        utm_content = str(argv[4])
    if term != packed:
        sp = "sp" if _strip_spaces(term) == _strip_spaces(packed) else "df"
    return term, packed, utm_campaign, utm_content, sp


def _record_no_result(conn, term, packed, utm_campaign):
    """Purge stale rows for (term, packed) and log the term in `noresults`.

    All statements are parameterized (the originals concatenated CLI input
    straight into SQL — injection risk).
    """
    with conn.cursor() as curs:
        curs.execute(
            "DELETE FROM `naverpowerlinkterms` WHERE term = %s AND PWTerm = %s",
            (term, packed))
        conn.commit()
        curs.execute(
            "DELETE FROM `noresults` WHERE term = %s AND PWTerm = %s",
            (term, packed))
        conn.commit()
        curs.execute(
            "INSERT INTO `noresults` VALUES(NULL, %s, %s, NOW(), %s)",
            (term, utm_campaign, packed))
        conn.commit()


def _collect_products(bs):
    """Scan the first result page's product list.

    Returns (priceArray, outofStockCount, sp_priceCount):
      priceArray      price * MOQ for each orderable ('장바구니') item with a
                      numeric price
      outofStockCount items flagged '일시품절' (temporarily out of stock)
      sp_priceCount   items showing '특별할인가' (special price, no number)
    """
    prices = []
    out_of_stock = 0
    special_price = 0
    for item in bs.select('ul.prd_list_type li'):
        price = item.select('div.price_box span.sale_price em.num')[0].getText().strip()
        price = re.sub(r"\,", "", price)
        if price == '특별할인가':
            price = 0
            special_price += 1
        moq = item.select('div.amount_box span.btn_wrap label input.pr-number')[0].get('value')
        status = item.select('div.btns a')[0].getText().strip()
        if price != 0 and status == '장바구니':
            prices.append(int(price) * int(moq))
        if status == "일시품절":
            out_of_stock += 1
    return prices, out_of_stock, special_price


def _collect_categories(bs, rc_int):
    """Extract the result categories and rank them by product count.

    Returns (top3cate, values, top_name):
      top3cate  tab-separated "name<TAB>count<TAB>share" for the top 3
                categories, padded with tabs when fewer than 3 exist
      values    exactly 9 SQL slot values (name, count, share x3), padded
                the same way the original string-built INSERT was: an empty
                string then NULLs.  Always length 9 (the original produced a
                malformed INSERT when no category was found).
      top_name  name of the top category, or None when there is none
    """
    counts = {}
    for a in bs.select('div.filter_wrap ul li dl.category dd ul li a'):
        name = a.getText().strip()
        count = a.select('em')[0].getText().strip()
        name = re.sub(r"\([0-9]+\)", "", name).strip()
        count = re.sub(r"\(|\)", "", count)
        counts[name] = int(count)
    ranked = sorted(counts.items(), key=lambda t: t[1], reverse=True)
    top3cate = ''
    values = []
    top_name = None
    for j, (name, cnt) in enumerate(ranked[:3]):
        if j != 0:
            top3cate += "\t"
        share = str(int(cnt) / rc_int)
        top3cate += name + "\t" + str(cnt) + "\t" + share
        values.extend([name, str(cnt), share])
        if j == 0:
            top_name = name
    # Tab/SQL padding reproduces the original layout exactly.
    if len(ranked) == 1:
        top3cate += "\t\t\t\t\t\t"
        values.extend(['', None, None, None, None, None])
    if len(ranked) == 2:
        top3cate += "\t\t\t"
        values.extend(['', None, None])
    if not ranked:
        values.extend([None] * 9)
    return top3cate, values, top_name


def _check_naver_tong(packed_q):
    """Scrape Naver integrated search for the quoted keyword *packed_q*.

    Returns (correctedKeyword, powerlink_count, ranks) where ranks maps the
    four tracked shop hosts to their 1-based powerlink position (as str) or
    int 0 when absent — same mixed typing the original printed/stored.
    """
    import bs4
    import requests
    tong_url = ('https://search.naver.com/search.naver'
                '?sm=tab_hty.top&where=nexearch&query=' + packed_q
                + '&oquery=' + packed_q)
    resp = requests.get(tong_url)
    resp.raise_for_status()
    resp.encoding = 'UTF-8'
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    corrected = soup.select('div.sp_keyword dl dd em')
    corrected_keyword = corrected[0].getText().strip() if corrected else 'N'
    links = soup.select('div#power_link_body ul.lst_type li.lst')
    ranks = {'www.imarket.co.kr': 0, 'kr.misumi-ec.com': 0,
             'www.navimro.com': 0, 'www.speedmall.co.kr': 0}
    for i, item in enumerate(links):
        site = item.select('div.inner a.lnk_url')[0].getText().strip()
        if site in ranks:
            ranks[site] = str(i + 1)
    return corrected_keyword, len(links), ranks


def _check_powerlink(packed_q):
    """Scrape Naver's dedicated ad search for the quoted keyword *packed_q*.

    Returns (result_count_str, ranks) with ranks keyed by the full
    'http://...' URLs the ad page displays.
    """
    import bs4
    import requests
    resp = requests.get(
        "https://ad.search.naver.com/search.naver?where=ad&sm=svc_nrs&query="
        + packed_q)
    resp.raise_for_status()
    resp.encoding = 'UTF-8'
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    count = soup.select('div.search_result div.inner span.num_result')[0].getText().strip()
    count = re.sub(r"[0-9]+\-[0-9]+\ \/\s", "", count)
    count = re.sub(r"건", "", count)
    ranks = {'http://www.imarket.co.kr': 0, 'http://kr.misumi-ec.com': 0,
             'http://www.navimro.com': 0, 'http://www.speedmall.co.kr': 0}
    # Original compared the string to int 0 (always true); compare to '0'.
    # Outcome is identical because the ad list is empty when count is '0'.
    if count != '0':
        for i, item in enumerate(soup.select('div.ad_section ol.lst_type li.lst')):
            site = item.select('div.inner div.url_area a.url')[0].getText().strip()
            if site in ranks:
                ranks[site] = str(i + 1)
    return count, ranks


def _record_result(conn, term, packed, row):
    """Replace any existing row for *packed* and clear *term* from `noresults`.

    The original SELECTed a history count and duplicated the insert in both
    branches; an unconditional delete-then-insert is equivalent (DELETE on a
    missing row is a no-op).  All statements are parameterized.
    """
    placeholders = ",".join(["%s"] * len(row))
    with conn.cursor() as curs:
        curs.execute("DELETE FROM `naverpowerlinkterms` WHERE PWTerm = %s",
                     (packed,))
        conn.commit()
        curs.execute("INSERT INTO `naverpowerlinkterms` VALUES (NULL,"
                     + placeholders + ",NOW())", row)
        conn.commit()
        curs.execute("DELETE FROM `noresults` WHERE term = %s", (term,))
        conn.commit()


def main():
    """Crawl, cross-check, print the TSV stats line, and persist to MySQL."""
    import bs4
    import pymysql
    startTime = datetime.now()
    conn = pymysql.connect(host='localhost', user='root',
                           password='dlsxjvkzmdkdlakzpt!', db='crawler',
                           unix_socket='/var/run/mysqld/mysqld.sock',
                           charset='utf8')
    term, packed, utm_campaign, utm_content, sp = _parse_args(sys.argv)

    urlterm = parse.quote(query(term))
    htmlHead = pageRequest(url, parameters(1, query(term)), headers())
    bs = bs4.BeautifulSoup(htmlHead, 'html.parser')
    rc = bs.select('div.tit_category_wrap h2.tit_result span em')[0].getText().strip()
    rc = re.sub(r"\,", "", rc)

    if rc == '0':
        print(term + "\t" + packed + "\t" + "NoResult")
        _record_no_result(conn, term, packed, utm_campaign)
        conn.close()
    else:
        prices, outofStockCount, sp_priceCount = _collect_products(bs)

        # Only the first 20 results are listed, so the "valid" count is
        # capped at 20 minus out-of-stock items.  (Original left `valid_rc`
        # as a string in one branch; it is an int everywhere now.)
        rc_int = int(rc)
        if rc_int <= 20:
            valid_rc = rc_int - outofStockCount
        elif outofStockCount == 0:
            valid_rc = rc_int
        else:
            valid_rc = 20 - outofStockCount

        priceAvg = round(sum(prices) / len(prices)) if prices else 0

        top3cate, cat_values, top_name = _collect_categories(bs, rc_int)
        if utm_content == '' and top_name is not None:
            utm_content = urllib.parse.quote_plus(top_name)

        # URL-quoted packed term for the Naver lookups.  (Original bound
        # this to `query`, shadowing the query() helper above.)
        packed_q = urllib.parse.quote_plus(packed)
        correctedKeyword, tongCount, tong = _check_naver_tong(packed_q)
        pwResultCount, pw = _check_powerlink(packed_q)

        utm_content = utm_content + sp
        destURL = ("http://www.imarket.co.kr/display/malls.do?_method=searchGoods"
                   "&sc.row=20&collList=product&BIZ_CD=1010187"
                   "&utm_source=naverPowerlink&utm_medium=prdcpc"
                   "&sc.queryText=" + urlterm
                   + "&utm_keyword=" + urllib.parse.quote_plus("$$") + packed_q
                   + "&utm_campaign=" + urllib.parse.quote_plus(utm_campaign)
                   + "&utm_content=" + utm_content)

        print(term + "\t" + packed + "\t" + str(rc) + "\t" + str(valid_rc)
              + "\t" + str(outofStockCount) + "\t" + str(sp_priceCount)
              + "\t" + str(priceAvg) + "\t" + top3cate
              + "\t" + correctedKeyword + "\t" + str(tongCount)
              + "\t" + str(pwResultCount)
              + "\t" + str(tong['www.imarket.co.kr'])
              + "\t" + str(pw['http://www.imarket.co.kr'])
              + "\t" + str(tong['www.navimro.com'])
              + "\t" + str(pw['http://www.navimro.com'])
              + "\t" + str(tong['kr.misumi-ec.com'])
              + "\t" + str(pw['http://kr.misumi-ec.com'])
              + "\t" + str(tong['www.speedmall.co.kr'])
              + "\t" + str(pw['http://www.speedmall.co.kr'])
              + "\t" + destURL)

        # 28 value slots + auto id + NOW(): same column layout as before.
        row = ([term, packed, str(rc), str(valid_rc), str(outofStockCount),
                str(sp_priceCount), str(priceAvg)]
               + cat_values
               + [correctedKeyword, str(tongCount), str(pwResultCount),
                  str(tong['www.imarket.co.kr']), str(pw['http://www.imarket.co.kr']),
                  str(tong['www.navimro.com']), str(pw['http://www.navimro.com']),
                  str(tong['kr.misumi-ec.com']), str(pw['http://kr.misumi-ec.com']),
                  str(tong['www.speedmall.co.kr']), str(pw['http://www.speedmall.co.kr']),
                  destURL])
        _record_result(conn, term, packed, row)
        conn.close()

    consumtime = datetime.now() - startTime
    logging.warning(term + "\t" + str(consumtime))


if __name__ == "__main__":
    main()