114 lines
3.7 KiB
Python
114 lines
3.7 KiB
Python
import requests, bs4, urllib, sys, re, math, logging
|
|
from datetime import datetime
|
|
# Timestamp taken when the script starts; it is subtracted from a second
# datetime.now() at the end of the script to log the elapsed run time.
startTime = datetime.now()
|
|
|
|
def makeQuery(term):
    """Percent-encode *term* so it can be used as a URL query-string value.

    Spaces become ``+`` and every reserved or non-ASCII character is
    escaped as UTF-8 ``%XX`` sequences.

    Args:
        term: The raw search term (str).

    Returns:
        The encoded term as a str.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not load
    # ``urllib.parse``; it would only be reachable if some other module
    # happened to import it first.
    from urllib.parse import quote_plus

    return quote_plus(term)
|
|
|
|
def makeURL(num, query):
    """Build the Naver ad-search URL for one results page.

    Args:
        num: Page index placed in the ``pagingIndex`` parameter; it is
            converted with ``str()``.
        query: Search term placed in the ``query`` parameter. It is inserted
            as given, with no escaping performed here.

    Returns:
        The complete URL as a str.
    """
    search_prefix = "https://ad.search.naver.com/search.naver?where=ad&sm=svc_nrs&query="
    with_query = search_prefix + query
    page_part = "&referenceId=&pagingIndex=" + str(num)
    return with_query + page_part
|
|
|
|
def getHTML(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the page as parsed HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Without one, requests waits indefinitely on an
            unresponsive server, which would hang the whole script.

    Returns:
        A ``bs4.BeautifulSoup`` document built with the ``html.parser``
        backend.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: The server did not respond within
            *timeout* seconds.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # Force UTF-8 decoding of the body instead of relying on the charset
    # requests derives from the response.
    resp.encoding = 'UTF-8'
    return bs4.BeautifulSoup(resp.text, 'html.parser')
|
|
|
|
def resultCount(allHTML):
    """Read the total number of ads from a parsed search-result page.

    The counter element's text is expected to look like ``"1-25 / 87건"``:
    an optional ``first-last / `` range prefix, the total, and the "건"
    suffix.

    Args:
        allHTML: Parsed page; must offer ``select(css_selector)`` returning
            a sequence of elements that have ``getText()``.

    Returns:
        The total as an int. 0 is returned when the page carries no counter
        element at all, so the caller's "no result" branch is taken instead
        of an IndexError.

    Raises:
        ValueError: The counter text is not a number after clean-up.
    """
    counters = allHTML.select('div.search_result div.inner span.num_result')
    if not counters:
        return 0

    text = counters[0].getText().strip()
    # Drop thousands separators first (in case a total is rendered as
    # "1,234") so both the range pattern below and int() see plain digits.
    text = text.replace(",", "")
    text = re.sub(r"[0-9]+\-[0-9]+\ \/\s", "", text)
    text = text.replace("건", "")
    return int(text)
|
|
|
|
def paging(resultCount, perPage=25):
    """Return how many pages are needed to list *resultCount* results.

    Args:
        resultCount: Total number of results; anything ``int()`` accepts.
        perPage: Results shown per page. Defaults to 25, the page size this
            function previously hard-coded.

    Returns:
        The page count as an int (the total divided by *perPage*, rounded
        up).

    Raises:
        ValueError: *perPage* is smaller than 1.
    """
    if perPage < 1:
        raise ValueError("perPage must be a positive integer")
    # Negated floor division is an exact integer ceiling; it avoids the
    # float round-trip of math.ceil(total / perPage).
    return -(-int(resultCount) // perPage)
|
|
|
|
# ---------------------------------------------------------------------------
# Script body.
#
# Fetches page 1 of the ad search for the term given on the command line and
# prints one tab-separated line per ad to stdout:
#
#   term, total count, rank, title, sub title, thumbnail flag ("Y" or ""),
#   displayed URL, service badges, menu links, promotion text, description,
#   price list, ad period
#
# Every column, including the last, is followed by a tab. When the total
# count is 0 a single "<term>\tNoResult" line is printed instead. The elapsed
# run time is logged at WARNING level at the end.
# ---------------------------------------------------------------------------

# (CSS selector, label) pairs tested against each badge element of an ad.
_ICON_LABELS = (
    ('span.ico_npay', 'nPay'),
    ('span.ico_nlogin', 'nLogin'),
    ('span.ico_talktalk', 'nTalkTalk'),
    ('span.ico_nreserve', 'nReserve'),
)


def _firstText(node, selector):
    """Return the stripped text of the first match of *selector* in *node*.

    Raises IndexError when nothing matches.
    """
    return node.select(selector)[0].getText().strip()


def _optionalText(node, selector):
    """Like _firstText, but return "" when *selector* matches nothing."""
    found = node.select(selector)
    return found[0].getText().strip() if found else ""


def _collapse(text, separator):
    """Flatten *text*: newlines, tabs and whitespace pairs become one space,
    then any remaining run of two or more whitespace characters becomes
    *separator*."""
    text = re.sub(r"\n|\t|\s\s", " ", text)
    return re.sub(r"\s\s+", separator, text)


def _iconStats(item):
    """Return the badge labels of one ad, one comma-separated entry per
    badge element (an entry is empty when no known badge matches)."""
    icons = item.select('div.inner div.url_area span.ico_area span.ico')
    return ",".join(
        "".join(label for selector, label in _ICON_LABELS if icon.select(selector))
        for icon in icons
    )


def _priceList(item):
    """Return the ad's price entries as "name (price)" joined by "||"."""
    entries = item.select('div.inner ul.lst_price li.item a')
    return "||".join(
        _firstText(entry, 'div.txt span') + " (" + _firstText(entry, 'span.price') + ")"
        for entry in entries
    )


def _adFields(item):
    """Extract the per-ad output columns of *item*, in print order.

    Title, displayed URL, description and ad period are mandatory: a missing
    element raises IndexError. Sub title, thumbnail, badges, menu links,
    promotion and price list are optional and yield "" when absent.
    """
    title = _firstText(item, 'div.inner a.lnk_tit')
    subTitle = _optionalText(item, 'div.inner a.sub_tit')
    thumb = "Y" if item.select('div.ad_thumb') else ""
    displayUrl = _firstText(item, 'div.inner div.url_area a.url')
    menuLinks = "||".join(
        link.getText().strip()
        for link in item.select('div.inner ul.lst_link li.item a')
    )
    # Whitespace runs left inside the promotion text become "|" separators.
    event = _collapse(_optionalText(item, 'div.inner p.promotion'), "|")
    description = _collapse(_firstText(item, 'div.inner p.ad_dsc_inner'), " ")
    # Remove the "광고집행기간" caption and newlines, keeping the period value.
    adPeriod = re.sub(r"광고집행기간|\n", "", _firstText(item, 'div.inner div.period_area'))
    return [
        title,
        subTitle,
        thumb,
        displayUrl,
        _iconStats(item),
        menuLinks,
        event,
        description,
        _priceList(item),
        adPeriod,
    ]


# Fail with a usage hint instead of a bare IndexError on sys.argv[1].
if len(sys.argv) < 2:
    sys.exit("usage: python <script> <search term>")

term = str(sys.argv[1])
url = makeURL(1, makeQuery(term))
content = getHTML(url)

# Kept under their own names so the resultCount() and paging() functions are
# not overwritten by their results.
totalCount = resultCount(content)
# NOTE(review): the page count is computed but never used; only page 1 is
# fetched. Presumably intended for following the remaining pages - confirm.
pageCount = paging(totalCount)

if totalCount == 0:
    print(term + "\t" + "NoResult")
else:
    adItems = content.select('div.ad_section ol.lst_type li.lst')
    for rank, item in enumerate(adItems, start=1):
        row = [term, str(totalCount), str(rank)] + _adFields(item)
        print("\t".join(row) + "\t")

# startTime is set near the top of the file, before any work is done.
elapsed = datetime.now() - startTime
logging.warning(term + "\t" + str(elapsed))