146 lines
4.7 KiB
Python
146 lines
4.7 KiB
Python
#!/usr/bin/env python
|
|
import requests, bs4, urllib, sys, re, math, logging, pymysql, json
|
|
from urllib import parse
|
|
from datetime import datetime
|
|
|
|
# Wall-clock start of the run; the elapsed time is logged when the script ends.
startTime = datetime.now()

# Module-level MySQL connection shared by the rest of the script.
# NOTE(review): credentials are hard-coded in the source — consider moving
# them to configuration or the environment.
conn = pymysql.connect(
    host='localhost',
    user='maddiekorea',
    password='mad(#lin',
    db='maddiekorea',
    unix_socket='/var/run/mysqld/mysqld.sock',
    charset='utf8',
)
|
|
|
|
def query(keyword, encoding):
    """Percent-encode *keyword* so it can be embedded in a search URL.

    When ``encoding`` is exactly ``'euc-kr'`` the keyword is first encoded to
    EUC-KR bytes and then quoted with ``quote`` (a space becomes ``%20`` and
    ``/`` is left as-is).  For any other value the keyword is quoted with
    ``quote_plus`` using its default UTF-8 encoding (a space becomes ``+``).

    Returns the quoted string.
    """
    if encoding != 'euc-kr':
        return urllib.parse.quote_plus(keyword)

    raw = keyword.encode('euc-kr')
    return parse.quote(raw)
|
|
|
|
def url(type, query):
    """Build the search URL for one source.

    Args:
        type: source name; one of 'naver', 'nShop', 'navimro', 'misumi',
            'imarket' or '11st'.  (The parameter name shadows the builtin
            but is kept for compatibility with existing callers.)
        query: the already percent-encoded search term; it is inserted into
            the URL verbatim.

    Returns:
        The full search URL as a string.

    Raises:
        ValueError: if ``type`` is not one of the supported sources
            (previously this surfaced as an UnboundLocalError).
    """
    templates = {
        'naver': "https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query={q}&oquery={q}",
        'nShop': "https://search.shopping.naver.com/search/all?query={q}&frm=NVSHATC",
        'navimro': "https://www.navimro.com/s/?q={q}&disp=1",
        'misumi': "https://kr.misumi-ec.com/vona2/result/?Keyword={q}&isReSearch=1",
        'imarket': "http://www.imarket.co.kr/display/malls.do?_method=searchGoods&sc.queryText={q}",
        '11st': "http://search.11st.co.kr/Search.tmall?kwd={q}",
    }
    try:
        template = templates[type]
    except KeyError:
        raise ValueError("unsupported search type: %r" % (type,))
    return template.format(q=query)
|
|
|
|
def getHtml(url, encoding, timeout=30):
    """Download *url* and return the page parsed with BeautifulSoup.

    Args:
        url: address to fetch with an HTTP GET.
        encoding: character encoding forced onto the response before its
            body is decoded to text.
        timeout: seconds passed to ``requests.get``.  Without a timeout
            requests waits indefinitely, so one stalled server would hang
            the whole crawl.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the 'html.parser' backend.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on a 4xx/5xx response.
        requests.Timeout: if the server does not respond within *timeout*.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # Override whatever encoding requests detected; callers know the charset
    # each site should be decoded with.
    resp.encoding = encoding
    return bs4.BeautifulSoup(resp.text, 'html.parser')
|
|
|
|
def suggestionList(type, bs):
    """Extract the related-keyword suggestions for one source from its page.

    Args:
        type: source name; one of 'naver', 'nShop', 'navimro', 'misumi' or
            '11st'.  (Shadows the builtin; kept for caller compatibility.)
        bs: the parsed page.  The tag-based sources call ``bs.select``;
            the '11st' branch only uses ``str(bs)``.

    Returns:
        For '11st', a list of keyword strings read from the JSON array that
        the page assigns to ``window.searchDataFactory.relatedKeywordsList``.
        For the other sources, the list of elements matched by ``select``
        (callers extract the text themselves).  An empty list when the
        'nShop' container or the '11st' marker is not present in the page.

    Raises:
        ValueError: if ``type`` is not supported (previously an
            UnboundLocalError), or if the '11st' payload is not valid JSON.
    """
    selectors = {
        'naver': 'ul._related_keyword_ul li a',
        'navimro': 'ul#suggestion-kw li.dp-block a.dp-block p.dp-block',
        'misumi': 'ul.m-linkList--keyword li a',
    }
    if type in selectors:
        return bs.select(selectors[type])

    if type == 'nShop':
        module = bs.select('div[class*="relatedTags_relation_srh_"]')
        return module[0].select('ul li a') if module else []

    if type == '11st':
        # The keywords are embedded in an inline script as
        #   window.searchDataFactory.relatedKeywordsList = [...];
        # A plain string split replaces the old regex, whose non-raw pattern
        # contained invalid escape sequences.
        marker = 'window.searchDataFactory.relatedKeywordsList = '
        parts = str(bs).split(marker)
        if len(parts) < 2:
            # Marker missing: treat as "no suggestions" instead of IndexError.
            return []
        line = parts[1].split('\n', 1)[0]
        # Drop only the trailing statement terminator; removing every ';'
        # would corrupt keywords that themselves contain a semicolon.
        payload = line.strip().rstrip(';')
        data = json.loads(payload)
        return [entry['relatedKwd'] for entry in data]

    raise ValueError("unsupported suggestion source: %r" % (type,))
|
|
|
|
def scoreBuild(type, lenth, index):
    """Compute the reverse-rank score of one suggestion.

    The score is ``log(lenth - index, lenth)`` plus a per-source boost, so
    in a list of ``lenth`` items index 0 scores 1 + boost and the last index
    scores 0 + boost.  A list of length 1 or less scores 1 + boost.

    Args:
        type: source name; one of 'naver', 'nShop', '11st', 'navimro' or
            'misumi'.  (Shadows the builtin; kept for caller compatibility.)
        lenth: number of suggestions in the list (name kept as-is because it
            is part of the signature).
        index: zero-based position of the suggestion within the list.

    Returns:
        The score as a float.

    Raises:
        ValueError: if ``type`` has no boost defined (previously an
            UnboundLocalError), or from ``math.log`` when
            ``index >= lenth`` for ``lenth > 1``.
    """
    boosts = {
        'naver': 0.001,
        'nShop': 0.015,
        '11st': 0.015,
        'navimro': 0.05,
        'misumi': 0.02,
    }
    if type not in boosts:
        raise ValueError("no score boost defined for source: %r" % (type,))
    scoreboost = boosts[type]

    if lenth > 1:
        return math.log(lenth - index, lenth) + scoreboost
    # log base 1 is undefined, so a single-item list gets the top score.
    return 1 + scoreboost
|
|
|
|
# Sources crawled on this run.  The helpers also know 'naver' and 'misumi',
# which are currently switched off.
typeList = ['11st', 'nShop', 'navimro']

####################################

if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " <search term>")
srcTerm = str(sys.argv[1])

# The DELETE/INSERT statements below are only printed (their execution is
# disabled), so the values embedded in them are escaped by hand rather than
# bound as parameters.
safeSrcTerm = conn.escape_string(srcTerm)

for source in typeList:
    # 11st is requested and decoded as EUC-KR; every other source as UTF-8.
    pageEncoding = 'euc-kr' if source == '11st' else 'UTF-8'

    crawlQuery = query(srcTerm, pageEncoding)
    targetUrl = url(source, crawlQuery)
    bs = getHtml(targetUrl, pageEncoding)
    _list = suggestionList(source, bs)

    # Check whether rows for this (term, source) pair already exist.
    # Parameters are bound by the driver instead of being concatenated into
    # the SQL text, since the term comes straight from the command line.
    with conn.cursor() as curs:
        curs.execute(
            "SELECT COUNT(*) FROM `relatedkeywords` WHERE srcTerm = %s AND source = %s",
            (srcTerm, source),
        )
        rows = curs.fetchall()
    historyCheck = rows[0][0]

    if historyCheck >= 1:
        deleteSql = "DELETE FROM `relatedkeywords` WHERE source = \"" + source + "\" AND srcTerm = \"" + safeSrcTerm + "\""
        print(deleteSql)

    for j, item in enumerate(_list):
        revRank = scoreBuild(source, len(_list), j)

        # 11st suggestions are already plain strings; the other sources
        # yield parsed elements whose text has to be extracted.
        if source != '11st':
            outputTerm = str(item.getText().strip())
        else:
            outputTerm = item

        # Look the suggested term up on imarket to read its result count.
        imQuery = query(outputTerm, 'euc-kr')
        imUrl = url('imarket', imQuery)
        imBs = getHtml(imUrl, 'EUC-KR')
        # NOTE(review): raises IndexError if the count element is missing
        # from the imarket page — confirm whether a default is wanted.
        rc = imBs.select('div.tit_category_wrap h2.tit_result span em')[0].getText().strip()
        rc = rc.replace(",", "")

        # The power-link ad lookup is disabled; every row is flagged "N".
        adcheckflag = "N"

        insertSql = (
            "INSERT INTO `relatedkeywords` VALUES (NULL,'" + source
            + "','" + safeSrcTerm
            + "','" + conn.escape_string(outputTerm)
            + "','" + str(revRank)
            + "','" + conn.escape_string(str(rc))
            + "',NOW(),'" + adcheckflag + "')"
        )
        print(insertSql)

consumtime = datetime.now() - startTime
logging.warning(str(consumtime))