python_apps/crwlers/lib/bulkCrawler.py
2023-11-03 14:49:12 +09:00

80 lines
3.0 KiB
Python

import logging
import math
import os
import re
import shlex
import subprocess
import sys
import urllib
from datetime import datetime

import bs4
import pandas as pd
import requests

ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation)
import naver
class bulkCrawler:
    """Turn a tab-separated term sheet into crawler shell commands and run them.

    On construction the sheet is loaded, every row is converted into a query
    descriptor, and one shell command per row is prepared.  Each command
    appends its output to a single timestamped result file.  Commands are
    executed one at a time through :meth:`excution`.
    """

    # Crawler scripts invoked by the generated commands.  Kept as class
    # attributes so a subclass or instance can point them elsewhere.
    STD_CRAWLER = "/home/maddiekorea/py/stdCrawler.py"
    EXCEPTION_CRAWLER = "/home/maddiekorea/py/exceptionCrawler.py"

    def __init__(self, campaign, xlsxfile):
        """Load the term sheet and prepare all crawl commands.

        Args:
            campaign: Campaign name; becomes the result-file prefix and is
                passed to the standard crawler through its ``-u`` option.
            xlsxfile: Path of the term sheet.  Despite the name it is parsed
                as a tab-separated text file (see :meth:`dataFrame`).
        """
        self.startTime = datetime.now()
        self.xlsxfile = xlsxfile
        self.campaign = campaign
        self.resfile = self.resFile()
        self.df = self.dataFrame()
        self.queries = self.queryTermList()
        self.commands = self.crawlCmd()

    def timeLog(self):
        """Log the time elapsed since this object was created."""
        logging.warning("Completed.\t%s", datetime.now() - self.startTime)

    def resFile(self):
        """Return the result file name ``<campaign>_<YYYYmmdd-HH_MM_SS>.tsv``."""
        # "%S" (seconds) replaces the original "%s", which is a non-portable
        # glibc extension that expands to the Unix epoch instead of seconds.
        stamp = self.startTime.strftime("%Y%m%d-%H_%M_%S")
        return self.campaign + "_" + stamp + ".tsv"

    def dataFrame(self):
        """Read the term sheet into a DataFrame.

        The file is parsed as tab-separated text whose first line is the
        header.  ``skiprows=[1]`` drops the line directly below the header.
        NOTE(review): presumably that line is a description row -- confirm.
        """
        return pd.read_csv(self.xlsxfile, sep='\t', header=0, skiprows=[1])

    def writeHeader(self):
        """Create (or truncate) the result file and write the column header line."""
        # NOTE(review): the names are separated by single spaces in this
        # source although the result file is named *.tsv -- confirm whether
        # tab separators were intended.
        resHeader = "아이마켓텀 검색결과수 유효결과수 품절수 평균가격 top1카테고리 top1상품수 top2카테고리 top2상품수 top3카테고리 top3상품수 targetURL 광고텀 치환텀 slots advertisers 아이마켓(통검) 나비엠알오(통검) 미스미(통검) 스피드몰(통검) 아이마켓(광고) 나비엠알오(광고) 미스미(광고) 스피드몰(광고)"
        # Written directly instead of shelling out to `echo ... > file`, so a
        # result path containing spaces or shell metacharacters cannot break it.
        with open(self.resfile, "w", encoding="utf-8") as handle:
            handle.write(resHeader + "\n")

    def queryTermList(self):
        """Build one query descriptor per row of the term sheet.

        Reads the columns ``imarket term``, ``naver term`` and ``비고``.
        A cell whose string form is ``"nan"`` is treated as empty.

        Returns:
            A list of dicts with the keys ``imarketTerm``, ``naverTerm``,
            ``remarksTerm``, ``query`` and ``mode``.  ``mode`` is ``"SEM"``
            when the ``비고`` cell is empty, otherwise the cell's text.
            ``query`` is the imarket term alone when both other cells are
            empty, otherwise ``"<imarket term>:<naver term>"``.
        """
        entries = []
        for _, row in self.df.iterrows():
            entry = {
                'imarketTerm': str(row['imarket term']),
                'naverTerm': str(row['naver term']),
                'remarksTerm': str(row['비고']),
            }
            no_remarks = entry['remarksTerm'] == "nan"
            if no_remarks and entry['naverTerm'] == "nan":
                entry['query'] = entry['imarketTerm']
            else:
                entry['query'] = entry['imarketTerm'] + ":" + entry['naverTerm']
            entry['mode'] = 'SEM' if no_remarks else entry['remarksTerm']
            entries.append(entry)
        return entries

    def crawlCmd(self):
        """Build the shell command line for every entry of ``self.queries``.

        ``SEM`` entries call the standard crawler with the campaign and the
        query; every other mode calls the exception crawler with the two
        terms and the mode.  Output is appended (``>>``) to the result file.

        Returns:
            A list of command strings, index-aligned with ``self.queries``.
        """
        # The terms come straight from the spreadsheet, so every dynamic part
        # is shell-quoted; quotes, `$` or backticks in a term would otherwise
        # break the command or be interpreted by the shell.
        target = shlex.quote(str(self.resfile))
        commands = []
        for entry in self.queries:
            if entry['mode'] == "SEM":
                argv = [
                    "python3", shlex.quote(self.STD_CRAWLER),
                    "-m", "SEM",
                    "-u", shlex.quote(str(self.campaign)),
                    "-q", shlex.quote(str(entry['query'])),
                ]
            else:
                argv = [
                    "python3", shlex.quote(self.EXCEPTION_CRAWLER),
                    shlex.quote(str(entry['imarketTerm'])),
                    shlex.quote(str(entry['naverTerm'])),
                    shlex.quote(str(entry['mode'])),
                ]
            commands.append(" ".join(argv) + " >> " + target)
        return commands

    def excution(self, commandpointer):
        """Run one prepared command and log its progress and duration.

        Args:
            commandpointer: Zero-based index into ``self.commands``.

        Raises:
            IndexError: If ``commandpointer`` is out of range.
        """
        command = self.commands[commandpointer]
        startTime = datetime.now()
        # The original opened a stdout pipe it never read and then wait()ed,
        # which can block once the pipe buffer fills.  The output is still
        # discarded here, but without a pipe.
        completed = subprocess.run(command, shell=True, stdout=subprocess.DEVNULL)
        elapsed = datetime.now() - startTime
        if completed.returncode != 0:
            logging.warning("Command exited with status %s: %s", completed.returncode, command)
        logging.warning(
            "%s/%s %s\t%s",
            commandpointer + 1,
            len(self.commands),
            self.queries[commandpointer]['query'],
            elapsed,
        )