python_apps/crwlers/imarket_nlp.py
2023-11-03 14:49:12 +09:00

134 lines
4.4 KiB
Python

#!/usr/bin/env python
#version 201904
import requests, bs4, urllib, sys, re, math, logging
from urllib import parse
from datetime import datetime
from konlpy.tag import Mecab
# Start timestamp; the end of the script subtracts it from the current time
# to log how long processing the search term took.
startTime = datetime.now()
# Module-wide KoNLPy Mecab tagger, used by checksimilarity() and addspace().
mecab = Mecab()
# Endpoint that every search request in this script is sent to.
url = "http://www.imarket.co.kr/display/malls.do"
def query(keyword):
    """Return *keyword* encoded as EUC-KR bytes.

    Raises:
        UnicodeEncodeError: if *keyword* contains a character that EUC-KR
            cannot represent.
    """
    return keyword.encode('euc-kr')
def parameters(page, query):
    """Build the request parameters for one page of a goods search.

    Args:
        page: Page number, stored under ``sc.page``.
        query: Search text, stored under ``sc.queryText`` exactly as given.

    Returns:
        A new dict with the fixed search settings (``searchGoods`` method,
        20 rows, list view) plus the given page and query text.
    """
    payload = {'_method': 'searchGoods'}
    payload['sc.page'] = page
    payload['sc.row'] = '20'
    payload['sc.viewType'] = 'list'
    payload['sc.queryText'] = query
    return payload
def headers():
    """Return a fresh dict of the fixed HTTP request headers.

    The values are constants (browser-like User-Agent, Korean-first
    Accept-Language, no-cache directives, imarket Host and Referer); a new
    dict is created on every call.
    """
    # Long values are split across adjacent literals; they concatenate to
    # exactly the single-line strings used before.
    accepted_types = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    )
    browser_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/66.0.3359.181 Safari/537.36'
    )
    return {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': accepted_types,
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'www.imarket.co.kr',
        'Pragma': 'no-cache',
        'Referer': 'http://www.imarket.co.kr/',
        'Save-Data': 'on',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': browser_agent,
    }
def pageRequest(url, parameters, headers, timeout=30):
    """Fetch *url* with an HTTP GET and return the body decoded as EUC-KR.

    Args:
        url: Address to request.
        parameters: Query-string parameters, passed to ``requests.get`` as
            ``params``.
        headers: HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the script; pass ``None`` to get that behaviour back.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.HTTPError: for a 4xx/5xx response
            (via ``raise_for_status``).
        requests.exceptions.Timeout: if the server does not answer within
            *timeout* seconds.
    """
    resp = requests.get(url, params=parameters, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # Force EUC-KR decoding instead of whatever charset requests would pick.
    resp.encoding = 'EUC-KR'
    return resp.text
def checksimilarity(term, obj):
    """Return the share of *term*'s distinct morphemes that also occur in *obj*.

    Both strings are split with the module-level ``mecab`` tagger
    (``mecab.morphs``) and compared as sets.

    Args:
        term: Search term.
        obj: Text to compare against (the caller passes a product name).

    Returns:
        A float between 0.0 and 1.0. 0.0 is returned when *term* yields no
        morphemes at all.
    """
    termMorphs = set(mecab.morphs(term))
    if not termMorphs:
        # An empty or whitespace-only term used to raise ZeroDivisionError.
        return 0.0
    objMorphs = set(mecab.morphs(obj))
    # Divide by the number of *distinct* term morphemes. The numerator is a
    # set intersection, so dividing by the raw list length meant a term with
    # a repeated morpheme could never reach 1.0 even on a full match.
    return len(termMorphs & objMorphs) / len(termMorphs)
def addspace(term):
    """Return *term* with its morphemes separated by single spaces.

    The string is split with the module-level ``mecab`` tagger and the
    pieces are joined back with one space between neighbours. A term that
    yields no morphemes gives an empty string.
    """
    return " ".join(mecab.morphs(term))
# Search term from the command line, with its morphemes spaced out.
term = str(addspace(sys.argv[1]))
# EUC-KR bytes of the term: sent as the query text and, percent-encoded,
# echoed in every output row.
encodedTerm = query(term)
urlterm = parse.quote(encodedTerm)

# Fetch and parse the first page of search results.
htmlHead = pageRequest(url, parameters(1, encodedTerm), headers())
bs = bs4.BeautifulSoup(htmlHead, 'html.parser')

# Text of the heading's <em>, with commas removed; it is compared with '0'
# below to report a search that found nothing.
rcNode = bs.select('div.tit_category_wrap h2.tit_result span em')[0]
rc = rcNode.getText().strip().replace(",", "")
if rc == '0':
    print("\t".join([term, urlterm, "NoResult"]))
# Each <li> under ul.prd_list_type is handled as one product entry and
# printed as one tab-separated row.
_list = bs.select('ul.prd_list_type li')
for rank, item in enumerate(_list, start=1):
    prdCode = item.select('div.info_box span.prd_code')[0].getText().strip()
    # Strip the leading "상품코드 : " ("product code : ") label.
    prdCode = re.sub(r"^상품코드\ \:\ ", "", prdCode)

    titleArr = item.select('div.info_box a.tit')
    if not titleArr:
        # No title link: report the term and rank, then skip this entry.
        # Previously the error was logged and the very next statement
        # indexed the empty selection, raising IndexError and aborting the
        # whole run.
        logging.error(term + " : " + str(rank))
        continue
    prdName = titleArr[0].getText().strip()

    # Morpheme-based check of how well the product name matches the term.
    matchRate = checksimilarity(term, prdName)

    promoMsg = item.select('div.info_box p.prd_promo')[0].getText().strip()
    price = item.select('div.price_box span.sale_price em.num')[0].getText().strip()
    price = price.replace(",", "")

    # The discount figure is optional; emit an empty column when absent.
    couponArr = item.select('div.price_box span.discount em.num')
    coupon = couponArr[0].getText().strip() if couponArr else ''

    moq = item.select('div.amount_box span.btn_wrap label input.pr-number')[0].get('value')
    imgURL = item.select('div.img_box a img')[0].get('src')

    # Multi-valued fields are collapsed into one comma-separated column.
    imgTags = ",".join(
        node.getText().strip() for node in item.select('div.img_box a span')
    )
    tagData = ",".join(
        node.getText().strip() for node in item.select('div.info_box p.info_box02 span')
    )

    outofStock = item.select('div.btns a')[0].getText().strip()
    # A "장바구니" (cart) button label is reported as "판매중" (on sale);
    # any other label is passed through unchanged.
    if outofStock == "장바구니":
        outofStock = "판매중"

    # Info spans are joined with tabs, so they extend the row with a
    # variable number of trailing columns.
    txtdata = "\t".join(
        node.getText().strip() for node in item.select('div.info_box p.prd_info span')
    )

    print("\t".join([
        term, urlterm, str(rc), str(rank), prdCode, prdName,
        str(matchRate), promoMsg, price, coupon, moq, imgTags,
        tagData, imgURL, outofStock, txtdata,
    ]))
# Log the search term together with the time elapsed since startTime.
consumtime = datetime.now() - startTime
logging.warning(f"{term}\t{consumtime}")