# Bulk search-crawl driver (file-listing artifact header removed).
import pandas as pd
|
|
import requests, bs4, urllib, sys, re, math, logging, os
|
|
from datetime import datetime
|
|
import subprocess
|
|
|
|
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
|
|
sys.path.append(ScriptLocation)
|
|
import naver
|
|
|
|
class bulkCrawler:
    """Drive a batch of search-crawl jobs described by a tab-separated export.

    Each row of the input file becomes one shell command invoking either
    stdCrawler.py (default "SEM" mode) or exceptionCrawler.py (rows with a
    value in the '비고' remarks column), appending output to a timestamped
    .tsv result file.
    """

    def __init__(self, campaign, xlsxfile):
        """Load the input table and precompute all crawl commands.

        campaign: label embedded in the result filename and passed to
                  stdCrawler via -u.
        xlsxfile: path to a tab-separated file (despite the name) with
                  columns 'imarket term', 'naver term', '비고'.
        """
        self.startTime = datetime.now()
        self.xlsxfile = xlsxfile
        self.campaign = campaign
        self.resfile = self.resFile()
        self.df = self.dataFrame()
        self.queries = self.queryTermList()
        self.commands = self.crawlCmd()

    def timeLog(self):
        """Log total elapsed time since construction."""
        consumtime = datetime.now() - self.startTime
        logging.warning("Completed.\t" + str(consumtime))

    def resFile(self):
        """Return the timestamped result-file name for this run.

        BUGFIX: the original format used %s (platform-dependent epoch
        seconds, unsupported outside glibc platforms); %S (zero-padded
        second) was clearly intended alongside %H and %M.
        """
        return self.campaign + "_" + self.startTime.strftime("%Y%m%d-%H_%M_%S") + ".tsv"

    def dataFrame(self):
        """Read the input as a tab-separated table.

        Row index 1 (the second physical line, presumably a sub-header)
        is skipped — TODO confirm against an actual input file.
        """
        return pd.read_csv(self.xlsxfile, sep='\t', header=0, skiprows=[1])

    def writeHeader(self):
        """Write the column-header line to the result file.

        The original shelled out to `echo "..." > file`; writing directly
        is equivalent (same bytes, trailing newline) and avoids shell
        quoting of the non-ASCII header.
        """
        resHeader = "아이마켓텀 검색결과수 유효결과수 품절수 평균가격 top1카테고리 top1상품수 top2카테고리 top2상품수 top3카테고리 top3상품수 targetURL 광고텀 치환텀 slots advertisers 아이마켓(통검) 나비엠알오(통검) 미스미(통검) 스피드몰(통검) 아이마켓(광고) 나비엠알오(광고) 미스미(광고) 스피드몰(광고)"
        with open(self.resfile, "w") as fh:
            fh.write(resHeader + "\n")

    def queryTermList(self):
        """Build one query descriptor per input row.

        Returns a list of dicts with keys imarketTerm / naverTerm /
        remarksTerm / query / mode. Pandas renders empty cells as NaN,
        so str(cell) == "nan" marks a missing value.
        """
        res = []
        for _, row in self.df.iterrows():
            imarket = str(row['imarket term'])
            naver_term = str(row['naver term'])
            remarks = str(row['비고'])
            entry = {
                'imarketTerm': imarket,
                'naverTerm': naver_term,
                'remarksTerm': remarks,
            }
            if remarks == "nan" and naver_term == "nan":
                # Plain term, default crawler.
                entry['query'] = imarket
                entry['mode'] = 'SEM'
            else:
                # Paired term; mode falls back to SEM unless the remarks
                # column names an exception mode.
                entry['query'] = imarket + ":" + naver_term
                entry['mode'] = 'SEM' if remarks == "nan" else remarks
            res.append(entry)
        return res

    def crawlCmd(self):
        """Render one shell command string per query.

        NOTE(security): spreadsheet cells are interpolated straight into
        shell strings later run with shell=True — a crafted cell could
        inject commands. Kept as-is to preserve the `>>` redirection
        contract; migrating to list-argv subprocess calls is advisable.
        """
        cmd = []
        for q in self.queries:
            if q['mode'] == "SEM":
                line = ("python3 /home/maddiekorea/py/stdCrawler.py"
                        + " -m SEM -u " + str(self.campaign)
                        + " -q \"" + str(q['query']) + "\" >> " + str(self.resfile))
            else:
                line = ("python3 /home/maddiekorea/py/exceptionCrawler.py \""
                        + str(q['imarketTerm']) + "\" \"" + str(q['naverTerm'])
                        + "\" " + str(q['mode']) + " >> " + str(self.resfile))
            cmd.append(line)
        return cmd

    def excution(self, commandpointer):
        """Run the commandpointer-th prepared command and log its duration.

        (Method name is a typo for "execution" but is kept — renaming
        would break external callers.)
        """
        startTime = datetime.now()
        proc = subprocess.Popen(self.commands[commandpointer],
                                stdout=subprocess.PIPE, shell=True)
        proc.wait()
        elapsed = datetime.now() - startTime
        logging.warning(str(commandpointer + 1) + "/" + str(len(self.commands))
                        + " " + self.queries[commandpointer]['query']
                        + "\t" + str(elapsed))