#!/usr/bin/env python
"""Collect related-keyword suggestions for a search term from several sites.

Usage: pass the search term as the first command-line argument.

For every source in ``typeList`` the script downloads the search-result page,
extracts the related keywords, looks up each keyword's result count on
imarket.co.kr and prints the SQL statements that would refresh the
``relatedkeywords`` table.  The DELETE/INSERT statements are only printed
(dry run); the only query actually sent to the database is the read-only
history check.
"""
import json
import logging
import math
import re
import sys
import urllib
from datetime import datetime
from urllib import parse

import bs4
import pymysql
import requests

startTime = datetime.now()

# Seconds to wait for any single HTTP response before giving up.
REQUEST_TIMEOUT = 30

# Sources crawled on each run.  'naver' and 'misumi' are also supported by
# url(), suggestionList() and scoreBuild() and can be added here.
typeList = ['11st', 'nShop', 'navimro']

# Search URL per source; "{q}" is replaced by the already URL-encoded query.
_URL_TEMPLATES = {
    'naver': "https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query={q}&oquery={q}",
    'nShop': "https://search.shopping.naver.com/search/all?query={q}&frm=NVSHATC",
    'navimro': "https://www.navimro.com/s/?q={q}&disp=1",
    'misumi': "https://kr.misumi-ec.com/vona2/result/?Keyword={q}&isReSearch=1",
    'imarket': "http://www.imarket.co.kr/display/malls.do?_method=searchGoods&sc.queryText={q}",
    '11st': "http://search.11st.co.kr/Search.tmall?kwd={q}",
}

# CSS selectors for the sources whose suggestions are plain anchor/text nodes.
_SUGGESTION_SELECTORS = {
    'naver': 'ul._related_keyword_ul li a',
    'navimro': 'ul#suggestion-kw li.dp-block a.dp-block p.dp-block',
    'misumi': 'ul.m-linkList--keyword li a',
}

# Constant added to the rank score of every suggestion from a given source.
_SCORE_BOOSTS = {
    'naver': 0.001,
    'nShop': 0.015,
    '11st': 0.015,
    'navimro': 0.05,
    'misumi': 0.02,
}

# JavaScript assignment that precedes the related-keyword JSON on 11st pages.
_11ST_MARKER = 'window.searchDataFactory.relatedKeywordsList = '


def query(keyword, encoding):
    """Return *keyword* URL-encoded for use in a query string.

    With ``encoding == 'euc-kr'`` the keyword is encoded to EUC-KR bytes and
    percent-quoted with ``quote``; for any other value it is quoted with
    ``quote_plus`` using its default encoding.
    """
    if encoding == 'euc-kr':
        return parse.quote(keyword.encode('euc-kr'))
    return urllib.parse.quote_plus(keyword)


def url(type, query):
    """Return the search URL of source *type* for the encoded *query*.

    Raises:
        ValueError: if *type* is not a known source.
    """
    try:
        template = _URL_TEMPLATES[type]
    except KeyError:
        raise ValueError("unknown source type: %r" % (type,)) from None
    return template.format(q=query)


def getHtml(url, encoding, timeout=REQUEST_TIMEOUT):
    """Fetch *url* and return it parsed as a ``BeautifulSoup`` document.

    The response body is decoded with *encoding*.  *timeout* (seconds) bounds
    the wait for the server so a stalled site cannot hang the whole run.

    Raises:
        requests.HTTPError: on a 4xx/5xx response.
        requests.Timeout: if the server does not answer within *timeout*.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = encoding
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def _parse_11st_keywords(html):
    """Extract the related keywords embedded as JSON in an 11st result page.

    Returns a list of keyword strings, or an empty list when the page does
    not contain the related-keyword assignment.
    """
    _, found, tail = html.partition(_11ST_MARKER)
    if not found:
        return []
    # The JSON value occupies the rest of the line and is terminated by ';'.
    # Only the trailing terminator is removed so ';' inside values survives.
    payload = tail.split('\n', 1)[0].strip().rstrip(';')
    return [entry['relatedKwd'] for entry in json.loads(payload)]


def suggestionList(type, bs):
    """Return the related-keyword suggestions found in page *bs*.

    For '11st' the result is a list of strings; for every other source it is
    a list of BeautifulSoup elements whose text is the keyword.

    Raises:
        ValueError: if *type* has no suggestion extractor.
    """
    if type == '11st':
        return _parse_11st_keywords(str(bs))
    if type == 'nShop':
        containers = bs.select('div[class*="relatedTags_relation_srh_"]')
        return containers[0].select('ul li a') if containers else []
    try:
        selector = _SUGGESTION_SELECTORS[type]
    except KeyError:
        raise ValueError("unknown source type: %r" % (type,)) from None
    return bs.select(selector)


def scoreBuild(type, lenth, index):
    """Return the rank score of the suggestion at position *index*.

    The score is ``log(lenth - index)`` in base *lenth* (1 for the first
    suggestion, 0 for the last) plus the per-source boost.  When there is at
    most one suggestion the score is ``1 + boost``.

    Raises:
        ValueError: if *type* has no configured boost.
    """
    try:
        boost = _SCORE_BOOSTS[type]
    except KeyError:
        raise ValueError("unknown source type: %r" % (type,)) from None
    if lenth > 1:
        return math.log(lenth - index, lenth) + boost
    return 1 + boost


def _imarket_result_count(term):
    """Return imarket's result count for *term* as a digit string.

    Thousands separators are removed.  Raises ``IndexError`` when the result
    page has no count element.
    """
    page = getHtml(url('imarket', query(term, 'euc-kr')), 'EUC-KR')
    count_text = page.select('div.tit_category_wrap h2.tit_result span em')[0].getText().strip()
    return count_text.replace(",", "")


def _process_source(conn, source, srcTerm):
    """Crawl one *source* for *srcTerm* and print the refresh statements."""
    # 11st expects (and serves) EUC-KR; the other sources use UTF-8.
    encoding = 'euc-kr' if source == '11st' else 'UTF-8'
    page = getHtml(url(source, query(srcTerm, encoding)), encoding)
    suggestions = suggestionList(source, page)

    with conn.cursor() as curs:
        # Parameterized: srcTerm comes straight from the command line.
        curs.execute(
            "SELECT COUNT(*) FROM `relatedkeywords` WHERE srcTerm = %s AND source = %s",
            (srcTerm, source),
        )
        historyCheck = curs.fetchone()[0]
        if historyCheck >= 1:
            # mogrify() renders the statement with properly escaped values.
            print(curs.mogrify(
                "DELETE FROM `relatedkeywords` WHERE source = %s AND srcTerm = %s",
                (source, srcTerm),
            ))

        total = len(suggestions)
        for rank, item in enumerate(suggestions):
            # 11st suggestions are already strings; the rest are HTML nodes.
            outputTerm = item if source == '11st' else str(item.getText().strip())
            revRank = scoreBuild(source, total, rank)
            rc = _imarket_result_count(outputTerm)
            # The power-link ad check is disabled, so the flag is always "N".
            adcheckflag = "N"
            print(curs.mogrify(
                "INSERT INTO `relatedkeywords` VALUES (NULL,%s,%s,%s,%s,%s,NOW(),%s)",
                (source, srcTerm, outputTerm, str(revRank), str(rc), adcheckflag),
            ))


def main():
    """Run the crawl for the term given as the first command-line argument."""
    if len(sys.argv) < 2:
        sys.exit("usage: " + sys.argv[0] + " SEARCH_TERM")
    srcTerm = str(sys.argv[1])

    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to configuration or the environment.
    conn = pymysql.connect(host='localhost', user='maddiekorea', password='mad(#lin',
                           db='maddiekorea', unix_socket='/var/run/mysqld/mysqld.sock',
                           charset='utf8')
    try:
        for source in typeList:
            _process_source(conn, source, srcTerm)
    finally:
        conn.close()

    consumtime = datetime.now() - startTime
    logging.warning(str(consumtime))


if __name__ == "__main__":
    main()