python_apps/crwlers/lib/naver.py
2023-11-03 14:49:12 +09:00

123 lines
4.0 KiB
Python

#!/usr/bin/env python
from datetime import datetime
import bs4, re
import logging
import requests
class NaverGet:
    """Download Naver search pages for one keyword and read ad listings from them.

    Constructing an instance performs two HTTP GET requests (the regular
    search page and the ad search page), parses both with BeautifulSoup and
    caches the parsed documents plus the ad list items selected from them.
    The remaining methods only read from those cached objects.
    """

    # (host fragment searched for in an ad's displayed URL, result-key prefix).
    # Order defines the key order of the rank dictionaries.
    _TRACKED_SITES = (
        ('www.imarket.co.kr', 'Imarket'),
        ('www.navimro.com', 'Navimro'),
        ('kr.misumi-ec.com', 'Misumi'),
        ('www.speedmall.co.kr', 'Speedmall'),
    )

    def __init__(self, keyword):
        """Fetch and parse both result pages for *keyword*.

        All spaces are removed from *keyword* before it is used as the query.
        Raises whatever ``getHTML`` raises (e.g. on an HTTP error status).
        """
        # Not read anywhere in this class; kept so external code can still
        # compute elapsed time from it.
        self.startTime = datetime.now()
        self.keyword = keyword.replace(" ", "")
        self.pwUrl = "https://ad.search.naver.com/search.naver"
        self.comboUrl = "https://search.naver.com/search.naver"
        self.combobs = self.getHTML(self.comboUrl, self.urlParameters("combo"))
        self.Pwbs = self.getHTML(self.pwUrl, self.urlParameters("pw"))
        self.comboList = self.getComboPWitems()
        self.PWList = self.getAllAds()

    def urlParameters(self, mode):
        """Return the query-string parameters for the given page *mode*.

        *mode* is ``"pw"`` (ad search page) or ``"combo"`` (regular search
        page).  Any other value raises ``ValueError`` instead of failing
        later with an unbound local variable.
        """
        if mode == "pw":
            return {
                'where': 'ad',
                'query': self.keyword,
            }
        elif mode == "combo":
            return {
                'sm': 'tab_hty.top',
                'where': 'nexearch',
                'query': self.keyword,
                'oquery': self.keyword,
            }
        else:
            raise ValueError("unknown mode: %r (expected 'pw' or 'combo')" % (mode,))

    def getHTML(self, url, urlparam, timeout=10):
        """GET *url* with *urlparam* and return the body parsed by BeautifulSoup.

        *timeout* (seconds) is forwarded to ``requests.get`` so a stalled
        connection cannot block forever.  Raises ``requests.HTTPError`` for
        4xx/5xx responses via ``raise_for_status``.
        """
        resp = requests.get(url, params=urlparam, timeout=timeout)
        resp.raise_for_status()
        # Force UTF-8 decoding regardless of what the response headers declare.
        resp.encoding = 'UTF-8'
        return bs4.BeautifulSoup(resp.text, 'html.parser')

    def getComboPWitems(self):
        """Return the ad list items found on the regular search page."""
        return self.combobs.select('div#power_link_body ul.lst_type li.lst')

    def getAllAds(self):
        """Return the ad list items found on the ad search page."""
        return self.Pwbs.select('div.ad_section ol.lst_type li.lst')

    def correctedTerm(self):
        """Return the text of the first corrected-keyword element, or ``''``.

        The element is looked up on the regular search page; surrounding
        whitespace is stripped.
        """
        matches = self.combobs.select('div.sp_keyword dl dd em')
        if not matches:
            return ''
        return matches[0].getText().strip()

    def AllAdvitiserCount(self):
        """Return the result count shown on the ad search page as an int.

        Returns 0 when the counter element is absent or holds no digits.
        """
        matches = self.Pwbs.select('div.search_result div.inner span.num_result')
        if not matches:
            return 0
        text = matches[0].getText().strip()
        # Drop a leading "<from>-<to> / " range so only the total remains.
        text = re.sub(r"[0-9]+\-[0-9]+\ \/\s", "", text)
        # The total may carry thousands separators or trailing non-digit
        # text, which int() rejects; take the last run of digits/commas.
        numbers = re.findall(r"[0-9][0-9,]*", text)
        if not numbers:
            return 0
        return int(numbers[-1].replace(",", ""))

    def getComboSlotCount(self):
        """Return how many ad list items the regular search page contains."""
        return len(self.getComboPWitems())

    def _rankSites(self, items, selector, suffix):
        """Map each tracked site to its 1-based position within *items*.

        For every item the first element matching *selector* supplies the
        displayed URL text.  Items without such an element are skipped but
        still occupy their position.  Values are strings; a site that never
        appears keeps ``''``.  If a site appears more than once, the last
        position wins.
        """
        ranks = {name + suffix: '' for _, name in self._TRACKED_SITES}
        for position, item in enumerate(items, start=1):
            links = item.select(selector)
            if not links:
                continue
            site = links[0].getText().strip()
            for host, name in self._TRACKED_SITES:
                if host in site:
                    ranks[name + suffix] = str(position)
        return ranks

    def getComboRank(self):
        """Return tracked-site positions among the regular search page ads.

        Keys are ``ImarketCombo``, ``NavimroCombo``, ``MisumiCombo`` and
        ``SpeedmallCombo``.
        """
        return self._rankSites(self.comboList, 'div.inner a.lnk_url', 'Combo')

    def getPWRank(self):
        """Return tracked-site positions among the ad search page ads.

        Keys are ``ImarketPw``, ``NavimroPw``, ``MisumiPw`` and
        ``SpeedmallPw``.
        """
        return self._rankSites(self.PWList, 'div.inner div.url_area a.url', 'Pw')

    def getSummaryResult(self):
        """Return one flat dict combining keyword, counts and both rank dicts."""
        resdic = {
            'nTerm': self.keyword,
            'correctedTerm': self.correctedTerm(),
            'comboSlots': self.getComboSlotCount(),
            'Advertisers': self.AllAdvitiserCount(),
        }
        resdic.update(self.getComboRank())
        resdic.update(self.getPWRank())
        return resdic