587 lines
11 KiB
Python
587 lines
11 KiB
Python
#version 201904
|
|
|
|
import requests, bs4, urllib, sys, re, math, logging, pymysql
|
|
|
|
from urllib import parse
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
startTime = datetime.now()
|
|
|
|
|
|
|
|
conn = pymysql.connect(host='localhost', user='root', password='dlsxjvkzmdkdlakzpt!',db='crawler', unix_socket='/var/run/mysqld/mysqld.sock', charset='utf8')
|
|
|
|
|
|
|
|
url = "http://www.imarket.co.kr/display/malls.do"
|
|
|
|
def query(keyword) :
|
|
|
|
query = keyword.encode('euc-kr')
|
|
|
|
return query
|
|
|
|
def parameters( page, query ) :
|
|
|
|
data = {
|
|
|
|
'_method': 'searchGoods',
|
|
|
|
'sc.page': page,
|
|
|
|
'sc.row': '20',
|
|
|
|
'sc.viewType': 'list',
|
|
|
|
'sc.queryText': query
|
|
|
|
}
|
|
|
|
return data
|
|
|
|
def headers() :
|
|
|
|
headers = {
|
|
|
|
'Content-Type': 'application/x-www-form-urlencoded',
|
|
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
|
|
|
'Accept-Encoding': 'gzip, deflate',
|
|
|
|
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
|
|
'Cache-Control': 'no-cache',
|
|
|
|
'Connection': 'keep-alive',
|
|
|
|
'Host': 'www.imarket.co.kr',
|
|
|
|
'Pragma': 'no-cache',
|
|
|
|
'Referer': 'http://www.imarket.co.kr/',
|
|
|
|
'Save-Data': 'on',
|
|
|
|
'Upgrade-Insecure-Requests': '1',
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
|
|
|
|
}
|
|
|
|
return headers
|
|
|
|
def pageRequest( url, parameters, headers ) :
|
|
|
|
resp = requests.get(url, params = parameters, headers = headers)
|
|
|
|
resp.raise_for_status()
|
|
|
|
resp.encoding='EUC-KR'
|
|
|
|
html = resp.text
|
|
|
|
return html
|
|
|
|
|
|
|
|
# sys.argv[1] 오리지널 텀
|
|
|
|
# sys.argv[2] replaced term for naver
|
|
|
|
# sys.argv[3] utm campaign
|
|
|
|
# sys.argv[4] utm content
|
|
|
|
|
|
|
|
utm_campaign = 'test'
|
|
|
|
utm_content = ''
|
|
|
|
sp = ''
|
|
|
|
|
|
|
|
term = str(sys.argv[1])
|
|
|
|
|
|
|
|
if len(sys.argv) == 2:
|
|
|
|
if term.find(" ") == -1 :
|
|
|
|
packed = term
|
|
|
|
else :
|
|
|
|
packed = re.sub(r"\ ","",term)
|
|
|
|
elif len(sys.argv) > 2 :
|
|
|
|
if sys.argv[2] == '-' or sys.argv[2] == '' :
|
|
|
|
if term.find(" ") == -1 :
|
|
|
|
packed = term
|
|
|
|
else :
|
|
|
|
packed = re.sub(r"\ ","",term)
|
|
|
|
else :
|
|
|
|
packed = re.sub(r"\ ","",sys.argv[2])
|
|
|
|
if len(sys.argv) == 4 :
|
|
|
|
if sys.argv[3] : utm_campaign = str(sys.argv[3])
|
|
|
|
if len(sys.argv) == 5 :
|
|
|
|
if sys.argv[3] : utm_campaign = str(sys.argv[3])
|
|
|
|
if sys.argv[4] : utm_content = str(sys.argv[4])
|
|
|
|
|
|
|
|
if term != packed :
|
|
|
|
if re.sub(r"\ ","",term) == re.sub(r"\ ","",packed) : sp = "sp"
|
|
|
|
else : sp = "df"
|
|
|
|
|
|
|
|
urlterm = parse.quote(query(term))
|
|
|
|
htmlHead = pageRequest(url, parameters(1,query(term)), headers())
|
|
|
|
|
|
|
|
bs = bs4.BeautifulSoup(htmlHead, 'html.parser')
|
|
|
|
|
|
|
|
rc = bs.select('div.tit_category_wrap h2.tit_result span em')[0].getText().strip()
|
|
|
|
rc = re.sub(r"\,","",rc)
|
|
|
|
|
|
|
|
if rc == '0' :
|
|
|
|
print(term + "\t" + packed + "\t" + "NoResult")
|
|
|
|
|
|
|
|
cursNR = conn.cursor()
|
|
|
|
deleteNRSql = "DELETE FROM `naverpowerlinkterms` WHERE term = \"" + term + "\" AND PWTerm = \"" + packed + "\""
|
|
|
|
cursNR.execute(deleteNRSql)
|
|
|
|
conn.commit()
|
|
|
|
|
|
|
|
curs_NR = conn.cursor()
|
|
|
|
delete_NRSql = "DELETE FROM `noresults` WHERE term = \"" + term + "\" AND PWTerm =\"" + packed + "\""
|
|
|
|
curs_NR.execute(delete_NRSql)
|
|
|
|
conn.commit()
|
|
|
|
|
|
|
|
cursNRIN = conn.cursor()
|
|
|
|
insetNRSql = "INSERT INTO `noresults` VALUES(NULL,'" + term + "','" + utm_campaign + "',NOW(),'" + packed + "')"
|
|
|
|
cursNRIN.execute(insetNRSql)
|
|
|
|
conn.commit()
|
|
|
|
|
|
|
|
conn.close()
|
|
|
|
else :
|
|
|
|
_list = bs.select('ul.prd_list_type li')
|
|
|
|
outofStockCount = 0
|
|
|
|
priceArray = []
|
|
|
|
sp_priceCount = 0
|
|
|
|
for i in range(len(_list)) :
|
|
|
|
price = _list[i].select('div.price_box span.sale_price em.num')[0].getText().strip()
|
|
|
|
price = re.sub(r"\,","",price)
|
|
|
|
if price == '특별할인가' :
|
|
|
|
price = 0
|
|
|
|
sp_priceCount = sp_priceCount + 1
|
|
|
|
moq = _list[i].select('div.amount_box span.btn_wrap label input.pr-number')[0].get('value')
|
|
|
|
outofStock = _list[i].select('div.btns a')[0].getText().strip()
|
|
|
|
if price != 0 and outofStock == '장바구니' :
|
|
|
|
priceArray.insert(i,int(price)*int(moq))
|
|
|
|
|
|
|
|
if outofStock == "일시품절" :
|
|
|
|
outofStockCount = outofStockCount + 1
|
|
|
|
|
|
|
|
valid_rc = 0
|
|
|
|
|
|
|
|
if int(rc) <= 20 :
|
|
|
|
valid_rc = int(rc) - outofStockCount
|
|
|
|
else :
|
|
|
|
if outofStockCount == 0 :
|
|
|
|
valid_rc = rc
|
|
|
|
else :
|
|
|
|
valid_rc = 20 - outofStockCount
|
|
|
|
|
|
|
|
if len(priceArray) != 0 : priceAvg = round(sum(priceArray)/len(priceArray))
|
|
|
|
else : priceAvg = 0
|
|
|
|
|
|
|
|
#category Get
|
|
|
|
categories = {}
|
|
|
|
_category = bs.select('div.filter_wrap ul li dl.category dd ul li a')
|
|
|
|
for i in range(len(_category)) :
|
|
|
|
categoryName = _category[i].getText().strip()
|
|
|
|
count = _category[i].select('em')[0].getText().strip()
|
|
|
|
categoryName = re.sub(r"\([0-9]+\)","",categoryName).strip()
|
|
|
|
count = re.sub(r"\(|\)","",count)
|
|
|
|
categories[categoryName] = int(count)
|
|
|
|
|
|
|
|
cate_tuple = sorted(categories.items(), key=lambda t : t[1], reverse=True)
|
|
|
|
|
|
|
|
top3cate = ''
|
|
|
|
top3cateSql = ""
|
|
|
|
|
|
|
|
for j in range(len(cate_tuple)) :
|
|
|
|
cates = list(cate_tuple[j])
|
|
|
|
if j < 3 :
|
|
|
|
if j != 0 :
|
|
|
|
top3cate = top3cate + "\t"
|
|
|
|
#top3cateSql = top3cateSql + "','"
|
|
|
|
top3cate = top3cate + cates[0] + "\t" + str(cates[1]) + "\t" + str(int(cates[1])/int(rc))
|
|
|
|
top3cateSql = top3cateSql + cates[0] + "','" + str(cates[1]) + "','" + str(int(cates[1])/int(rc)) + "','"
|
|
|
|
if j == 0 :
|
|
|
|
if utm_content == '' :
|
|
|
|
utm_content = urllib.parse.quote_plus(cates[0])
|
|
|
|
if len(cate_tuple) == 1 :
|
|
|
|
top3cate = top3cate + "\t\t\t\t\t\t"
|
|
|
|
top3cateSql = top3cateSql + "',NULL,NULL,NULL,NULL,NULL,'"
|
|
|
|
if len(cate_tuple) == 2 :
|
|
|
|
top3cate = top3cate + "\t\t\t"
|
|
|
|
top3cateSql = top3cateSql + "',NULL,NULL,'"
|
|
|
|
|
|
|
|
#check naverTong
|
|
|
|
query = urllib.parse.quote_plus(packed)
|
|
|
|
nTongUrl = 'https://search.naver.com/search.naver'
|
|
|
|
nTongUrl2 = '?sm=tab_hty.top&where=nexearch&query=' + query + '&oquery=' + query
|
|
|
|
nTongUrl = nTongUrl + nTongUrl2
|
|
|
|
nTongResp = requests.get(nTongUrl)
|
|
|
|
nTongResp.raise_for_status()
|
|
|
|
nTongResp.encoding='UTF-8'
|
|
|
|
nTongHtml = nTongResp.text
|
|
|
|
nTongbs = bs4.BeautifulSoup(nTongHtml, 'html.parser')
|
|
|
|
correctedTerm = nTongbs.select('div.sp_keyword dl dd em')
|
|
|
|
|
|
|
|
if len(correctedTerm) != 0 :
|
|
|
|
correctedKeyword = correctedTerm[0].getText().strip()
|
|
|
|
else : correctedKeyword = 'N'
|
|
|
|
|
|
|
|
tong_powerlink = nTongbs.select('div#power_link_body ul.lst_type li.lst')
|
|
|
|
tong_misumi_txt = 0
|
|
|
|
tong_navimro_txt = 0
|
|
|
|
tong_imarket_txt = 0
|
|
|
|
tong_speedmall_txt = 0
|
|
|
|
|
|
|
|
if len(tong_powerlink) > 0 :
|
|
|
|
for i in range(len(tong_powerlink)) :
|
|
|
|
site = tong_powerlink[i].select('div.inner a.lnk_url')[0].getText().strip()
|
|
|
|
if site == 'www.imarket.co.kr' : tong_imarket_txt = str(i + 1)
|
|
|
|
if site == 'kr.misumi-ec.com' : tong_misumi_txt = str(i + 1)
|
|
|
|
if site == 'www.navimro.com' : tong_navimro_txt = str(i + 1)
|
|
|
|
if site == 'www.speedmall.co.kr' : tong_speedmall_txt = str(i + 1)
|
|
|
|
|
|
|
|
#check Powerlink
|
|
|
|
pwUrl = "https://ad.search.naver.com/search.naver?where=ad&sm=svc_nrs&query=" + query
|
|
|
|
pwResp = requests.get(pwUrl)
|
|
|
|
pwResp.raise_for_status()
|
|
|
|
pwResp.encoding='UTF-8'
|
|
|
|
pwHtml = pwResp.text
|
|
|
|
pwbs = bs4.BeautifulSoup(pwHtml, 'html.parser')
|
|
|
|
pwResultCount = pwbs.select('div.search_result div.inner span.num_result')[0].getText().strip()
|
|
|
|
pwResultCount = re.sub(r"[0-9]+\-[0-9]+\ \/\s","",pwResultCount)
|
|
|
|
pwResultCount = re.sub(r"건","",pwResultCount)
|
|
|
|
pw_misumi_txt = 0
|
|
|
|
pw_navimro_txt = 0
|
|
|
|
pw_imarket_txt = 0
|
|
|
|
pw_speedmall_txt = 0
|
|
|
|
|
|
|
|
if pwResultCount != 0 :
|
|
|
|
pw_list = pwbs.select('div.ad_section ol.lst_type li.lst')
|
|
|
|
for i in range(len(pw_list)) :
|
|
|
|
site = pw_list[i].select('div.inner div.url_area a.url')[0].getText().strip()
|
|
|
|
if site == 'http://www.imarket.co.kr' : pw_imarket_txt = str(i + 1)
|
|
|
|
if site == 'http://kr.misumi-ec.com' : pw_misumi_txt = str(i + 1)
|
|
|
|
if site == 'http://www.navimro.com' : pw_navimro_txt = str(i + 1)
|
|
|
|
if site == 'http://www.speedmall.co.kr' : pw_speedmall_txt = str(i + 1)
|
|
|
|
|
|
|
|
utm_content = utm_content + sp
|
|
|
|
|
|
|
|
destURL = "http://www.imarket.co.kr/display/malls.do?_method=searchGoods&sc.row=20&collList=product&BIZ_CD=1010187&utm_source=naverPowerlink&utm_medium=prdcpc&sc.queryText=" + urlterm + "&utm_keyword=" + urllib.parse.quote_plus("$$") + query + "&utm_campaign=" + urllib.parse.quote_plus(utm_campaign) + "&utm_content=" + utm_content
|
|
|
|
|
|
|
|
print(term + "\t" + packed + "\t"
|
|
|
|
+ str(rc) + "\t"
|
|
|
|
+ str(valid_rc) + "\t"
|
|
|
|
+ str(outofStockCount) + "\t"
|
|
|
|
+ str(sp_priceCount) + "\t"
|
|
|
|
+ str(priceAvg) + "\t"
|
|
|
|
+ top3cate + "\t"
|
|
|
|
+ correctedKeyword + "\t"
|
|
|
|
+ str(len(tong_powerlink)) + "\t"
|
|
|
|
+ str(pwResultCount) + "\t"
|
|
|
|
+ str(tong_imarket_txt) + "\t"
|
|
|
|
+ str(pw_imarket_txt) + "\t"
|
|
|
|
+ str(tong_navimro_txt) + "\t"
|
|
|
|
+ str(pw_navimro_txt) + "\t"
|
|
|
|
+ str(tong_misumi_txt) + "\t"
|
|
|
|
+ str(pw_misumi_txt) + "\t"
|
|
|
|
+ str(tong_speedmall_txt) + "\t"
|
|
|
|
+ str(pw_speedmall_txt) + "\t"
|
|
|
|
+ destURL
|
|
|
|
)
|
|
|
|
|
|
|
|
# Connection 으로부터 Cursor 생성
|
|
|
|
curs = conn.cursor()
|
|
|
|
# SQL문 실행
|
|
|
|
sql = "SELECT COUNT(*) FROM `naverpowerlinkterms` WHERE PWTerm = \"" + packed + "\""
|
|
|
|
curs.execute(sql)
|
|
|
|
# 데이타 Fetch
|
|
|
|
rows = curs.fetchall()
|
|
|
|
#print(rows) # 전체 rows
|
|
|
|
historyCheck = rows[0][0]
|
|
|
|
|
|
|
|
if historyCheck < 1 :
|
|
|
|
cursInsert = conn.cursor()
|
|
|
|
insertSql = "INSERT INTO `naverpowerlinkterms` VALUES (NULL,'" + term + "','" + packed + "','" + str(rc) + "','" + str(valid_rc) + "','" + str(outofStockCount) + "','" + str(sp_priceCount) + "','" + str(priceAvg) + "','"
|
|
|
|
insertSql = insertSql + top3cateSql
|
|
|
|
insertSql = insertSql + correctedKeyword + "','" + str(len(tong_powerlink)) + "','" + str(pwResultCount) + "','" + str(tong_imarket_txt) + "','" + str(pw_imarket_txt) + "','" + str(tong_navimro_txt) + "','" + str(pw_navimro_txt) + "','" + str(tong_misumi_txt) + "','" + str(pw_misumi_txt) + "','" + str(tong_speedmall_txt) + "','" + str(pw_speedmall_txt) + "','" + destURL + "',NOW())"
|
|
|
|
cursInsert.execute(insertSql)
|
|
|
|
conn.commit()
|
|
|
|
|
|
|
|
curs_NR = conn.cursor()
|
|
|
|
delete_NRSql = "DELETE FROM `noresults` WHERE term = \"" + term + "\""
|
|
|
|
curs_NR.execute(delete_NRSql)
|
|
|
|
conn.commit()
|
|
|
|
else :
|
|
|
|
cursDelete = conn.cursor()
|
|
|
|
deleteSql = "DELETE FROM `naverpowerlinkterms` WHERE PWterm = \"" + packed + "\""
|
|
|
|
cursDelete.execute(deleteSql)
|
|
|
|
conn.commit()
|
|
|
|
|
|
|
|
cursInsert = conn.cursor()
|
|
|
|
insertSql = "INSERT INTO `naverpowerlinkterms` VALUES (NULL,'" + term + "','" + packed + "','" + str(rc) + "','" + str(valid_rc) + "','" + str(outofStockCount) + "','" + str(sp_priceCount) + "','" + str(priceAvg) + "','"
|
|
|
|
insertSql = insertSql + top3cateSql
|
|
|
|
insertSql = insertSql + correctedKeyword + "','" + str(len(tong_powerlink)) + "','" + str(pwResultCount) + "','" + str(tong_imarket_txt) + "','" + str(pw_imarket_txt) + "','" + str(tong_navimro_txt) + "','" + str(pw_navimro_txt) + "','" + str(tong_misumi_txt) + "','" + str(pw_misumi_txt) + "','" + str(tong_speedmall_txt) + "','" + str(pw_speedmall_txt) + "','" + destURL + "',NOW())"
|
|
|
|
cursInsert.execute(insertSql)
|
|
|
|
conn.commit()
|
|
|
|
|
|
|
|
curs_NR = conn.cursor()
|
|
|
|
delete_NRSql = "DELETE FROM `noresults` WHERE term = \"" + term + "\""
|
|
|
|
curs_NR.execute(delete_NRSql)
|
|
|
|
conn.commit()
|
|
|
|
|
|
|
|
conn.close()
|
|
|
|
|
|
|
|
consumtime = datetime.now() - startTime
|
|
|
|
logging.warning(term + "\t" + str(consumtime)) |