python_apps/crwlers/relatedTermCollector.py
2023-11-03 14:49:12 +09:00

116 lines
4.0 KiB
Python

import requests, bs4, urllib, sys, re, math, logging, pymysql
from urllib import parse
from datetime import datetime
# Wall-clock reference taken at start-up; the elapsed time is logged at the end of the run.
startTime = datetime.now()

# Single shared MySQL connection (local unix socket) used by every query in this script.
# NOTE(review): the database password is hard-coded in source control; consider
# loading it from the environment or a protected config file and rotating it.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='dlsxjvkzmdkdlakzpt!',
    db='crawler',
    unix_socket='/var/run/mysqld/mysqld.sock',
    charset='utf8',
)
def query(keyword, encoding):
    """Percent-encode a search keyword for use inside a URL.

    Args:
        keyword: The raw search term.
        encoding: Exactly 'euc-kr' (case-sensitive) selects EUC-KR byte
            encoding; any other value selects the default UTF-8 path.

    Returns:
        For 'euc-kr': the keyword encoded to EUC-KR bytes and quoted with
        ``quote`` (spaces become ``%20``, ``/`` is left as is).
        Otherwise: the keyword quoted with ``quote_plus`` (spaces become
        ``+``, ``/`` is escaped).

    Raises:
        UnicodeEncodeError: If the 'euc-kr' path is chosen and the keyword
            contains characters that EUC-KR cannot represent.
    """
    if encoding != 'euc-kr':
        return urllib.parse.quote_plus(keyword)

    raw_bytes = keyword.encode('euc-kr')
    return parse.quote(raw_bytes)
def url(type, query):
    """Build the search-result URL for one crawl target.

    Args:
        type: Target site key; one of 'naver', 'nShop', 'navimro',
            'misumi' or 'imarket'.
        query: Search term that is already URL-encoded; it is inserted
            into the URL verbatim.

    Returns:
        The complete search URL as a string.

    Raises:
        ValueError: If ``type`` is not one of the known site keys.
            (Previously this surfaced as an UnboundLocalError on return.)
    """
    if type == 'naver':
        return "https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query=" + query + "&oquery=" + query
    elif type == 'nShop':
        return "https://search.shopping.naver.com/search/all.nhn?origQuery=" + query + "&pagingIndex=1&pagingSize=80&viewType=list&sort=rel&frm=NVSHTTL&query=" + query
    elif type == 'navimro':
        return "https://www.navimro.com/s/?q=" + query + "&disp=1"
    elif type == 'misumi':
        return "https://kr.misumi-ec.com/vona2/result/?Keyword=" + query + "&isReSearch=1"
    elif type == 'imarket':
        return "http://www.imarket.co.kr/display/malls.do?_method=searchGoods&sc.queryText=" + query
    raise ValueError("unknown crawl target type: %r" % (type,))
def getHtml(url, encoding, timeout=30):
    """Download a page and parse it into a BeautifulSoup document.

    Args:
        url: Absolute URL to fetch with an HTTP GET.
        encoding: Character encoding used to decode the response body;
            it overrides whatever encoding requests would pick.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled server would block the crawler forever.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the 'html.parser' parser.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # Force the decoding before touching .text so the body is decoded with
    # the caller-supplied encoding.
    resp.encoding = encoding
    return bs4.BeautifulSoup(resp.text, 'html.parser')
def suggestionList(type, bs):
    """Select the related-keyword elements from a parsed search page.

    Args:
        type: Source site key; one of 'naver', 'nShop', 'navimro' or
            'misumi'. ('imarket' has no suggestion selector here.)
        bs: Parsed page exposing a ``select(css_selector)`` method, as
            returned by ``getHtml``.

    Returns:
        Whatever ``bs.select`` returns for the site's suggestion selector.

    Raises:
        ValueError: If ``type`` has no known selector. (Previously this
            surfaced as an UnboundLocalError on return.)
    """
    selectors = {
        'naver': 'ul._related_keyword_ul li a',
        'nShop': 'div.co_relation_srh ul li a',
        'navimro': 'ul#suggestion-kw li.dp-block a.dp-block p.dp-block',
        'misumi': 'ul.m-linkList--keyword li a',
    }
    selector = selectors.get(type)
    if selector is None:
        raise ValueError("no suggestion selector for source type: %r" % (type,))
    return bs.select(selector)
def scoreBuild(type, lenth, index):
    """Compute the reverse-rank score of one suggestion.

    The score is ``log(lenth - index, lenth)`` plus a fixed per-source
    boost, so the first suggestion (index 0) scores ``1 + boost`` and the
    last one (index ``lenth - 1``) scores just ``boost``.

    Args:
        type: Source site key; one of 'naver', 'nShop', 'navimro' or
            'misumi'.
        lenth: Total number of suggestions returned by the source.
        index: Zero-based position of this suggestion in the list.

    Returns:
        The score as a float. When ``lenth`` is 1 or less the logarithm is
        skipped and ``1 + boost`` is returned.

    Raises:
        ValueError: If ``type`` has no configured boost (previously an
            UnboundLocalError), or from ``math.log`` when
            ``lenth - index`` is not positive.
    """
    boosts = {
        'naver': 0.001,
        'nShop': 0.015,
        'navimro': 0.05,
        'misumi': 0.02,
    }
    scoreboost = boosts.get(type)
    if scoreboost is None:
        raise ValueError("no score boost for source type: %r" % (type,))

    if lenth > 1:
        return math.log(lenth - index, lenth) + scoreboost
    # A base-1 (or smaller) logarithm is undefined; a lone suggestion gets
    # the top score.
    return 1 + scoreboost
# Suggestion sources to crawl, in processing order.
typeList = ['naver', 'nShop', 'navimro', 'misumi']

# CSS selector of the element that holds the hit count on an imarket result page.
_IMARKET_COUNT_SELECTOR = 'div.tit_category_wrap h2.tit_result span em'


def _imarket_result_count(term):
    """Return the imarket hit count for ``term`` as a string without commas."""
    imBs = getHtml(url('imarket', query(term, 'euc-kr')), 'EUC-KR')
    matches = imBs.select(_IMARKET_COUNT_SELECTOR)
    if not matches:
        # Indexing [0] on an empty result used to abort the whole run.
        # NOTE(review): storing 0 assumes a missing counter means "no hits" - confirm.
        logging.warning("imarket result count not found for %r; storing 0", term)
        return "0"
    return matches[0].getText().strip().replace(",", "")


def _is_powerlink_term(term):
    """Return True when ``term`` (spaces removed) exists in `naverpowerlinkterms`."""
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT COUNT(*) FROM `naverpowerlinkterms` WHERE PWTerm = %s",
            (term.replace(" ", ""),),
        )
        return cursor.fetchone()[0] >= 1


def _store_related_terms(source, srcTerm, terms):
    """Replace the stored related keywords of (source, srcTerm) with ``terms``."""
    # Deleting zero rows is a no-op, so no prior COUNT(*) check is needed.
    with conn.cursor() as cursor:
        cursor.execute(
            "DELETE FROM `relatedkeywords` WHERE source = %s AND srcTerm = %s",
            (source, srcTerm),
        )
    conn.commit()

    for index, term in enumerate(terms):
        revRank = scoreBuild(source, len(terms), index)
        rc = _imarket_result_count(term)
        adcheckflag = "Y" if _is_powerlink_term(term) else "N"
        # Values are passed as parameters (never concatenated) because both
        # the command-line term and the scraped text are untrusted input.
        with conn.cursor() as cursor:
            cursor.execute(
                "INSERT INTO `relatedkeywords` VALUES (NULL,%s,%s,%s,%s,%s,NOW(),%s)",
                (source, srcTerm, term, str(revRank), rc, adcheckflag),
            )
        # Commit per row so already-collected terms survive a later failure.
        conn.commit()


if len(sys.argv) < 2:
    sys.exit("usage: %s <source term>" % sys.argv[0])

srcTerm = str(sys.argv[1])
# The encoded source term is the same for every source, so build it once.
crawlQuery = query(srcTerm, 'UTF-8')

try:
    for source in typeList:
        # Crawl first: if the fetch fails, the previously stored rows are kept.
        bs = getHtml(url(source, crawlQuery), 'UTF-8')
        _list = suggestionList(source, bs)
        _store_related_terms(
            source, srcTerm, [item.getText().strip() for item in _list]
        )
finally:
    conn.close()

# Report the total run time.
consumtime = datetime.now() - startTime
logging.warning(str(consumtime))