import logging
import os
import subprocess
import sys
from datetime import datetime

import pandas as pd
import requests
import bs4
import urllib
import re
import math

# Make the script's own directory importable so the sibling `naver` module
# resolves regardless of the caller's working directory.
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation)
import naver


class bulkCrawler:
    """Drive a batch of crawler subprocesses from a tab-separated keyword file.

    Reads one keyword per row from ``xlsxfile`` (despite the name, the file is
    parsed as TSV), builds one shell command per keyword, and appends each
    crawler's stdout to a timestamped ``.tsv`` result file.
    """

    def __init__(self, campaign, xlsxfile):
        """Load the keyword file and precompute all crawl commands.

        :param campaign: campaign label; used as the result-file prefix and
                         passed to the standard crawler via ``-u``.
        :param xlsxfile: path to the tab-separated keyword file.
        """
        self.startTime = datetime.now()
        self.xlsxfile = xlsxfile
        self.campaign = campaign
        self.resfile = self.resFile()
        self.df = self.dataFrame()
        self.queries = self.queryTermList()
        self.commands = self.crawlCmd()

    def timeLog(self):
        """Log total wall-clock time since construction.

        Uses WARNING level so the message appears under the default
        (unconfigured) logging setup.
        """
        consumtime = datetime.now() - self.startTime
        logging.warning("Completed.\t" + str(consumtime))

    def resFile(self):
        """Return the timestamped result-file name for this run.

        BUG FIX: the format previously used ``%s`` (a non-portable glibc
        extension meaning seconds-since-epoch); ``%S`` — the zero-padded
        second — is what the ``%H_%M_`` prefix clearly intends.
        """
        resfile = self.campaign + "_" + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + ".tsv"
        return resfile

    def dataFrame(self):
        """Parse the keyword file as TSV.

        The second physical row (``skiprows=[1]``) is dropped — presumably a
        sub-header row; verify against the input template.
        """
        df = pd.read_csv(self.xlsxfile, sep='\t', header=0, skiprows=[1])
        return df

    def writeHeader(self):
        """Write the result-file header line, truncating any existing file.

        Previously this shelled out to ``echo "..." > file`` via a subprocess;
        writing the file directly produces the identical output (header plus
        trailing newline) without shell-quoting hazards.
        """
        resHeader = "아이마켓텀 검색결과수 유효결과수 품절수 평균가격 top1카테고리 top1상품수 top2카테고리 top2상품수 top3카테고리 top3상품수 targetURL 광고텀 치환텀 slots advertisers 아이마켓(통검) 나비엠알오(통검) 미스미(통검) 스피드몰(통검) 아이마켓(광고) 나비엠알오(광고) 미스미(광고) 스피드몰(광고)"
        with open(self.resfile, 'w') as fh:
            fh.write(resHeader + "\n")

    def queryTermList(self):
        """Build one query descriptor per spreadsheet row.

        Each descriptor is a dict with keys ``imarketTerm``, ``naverTerm``,
        ``remarksTerm``, ``query`` and ``mode``. Empty cells arrive as NaN and
        become the literal string ``"nan"`` after ``str()`` — that sentinel is
        what the comparisons below test.

        :returns: list of dicts, one per data row.
        """
        res = []
        for i in range(len(self.df)):
            _res = {}
            _res['imarketTerm'] = str(self.df.iloc[i]['imarket term'])
            _res['naverTerm'] = str(self.df.iloc[i]['naver term'])
            _res['remarksTerm'] = str(self.df.iloc[i]['비고'])
            if _res['remarksTerm'] == "nan":
                # No remark: standard SEM crawl. Query is the imarket term,
                # optionally paired with the naver term when one is present.
                if _res['naverTerm'] == "nan":
                    _res['query'] = _res['imarketTerm']
                    _res['mode'] = 'SEM'
                else:
                    _res['query'] = _res['imarketTerm'] + ":" + _res['naverTerm']
                    _res['mode'] = 'SEM'
            else:
                # A remark selects the exception-crawler mode verbatim.
                _res['query'] = _res['imarketTerm'] + ":" + _res['naverTerm']
                _res['mode'] = _res['remarksTerm']
            res.append(_res)
        return res

    def crawlCmd(self):
        """Build one shell command string per query.

        SEM queries go to ``stdCrawler.py``; anything else goes to
        ``exceptionCrawler.py`` with the mode passed through. Output of every
        command is appended (``>>``) to the result file.

        SECURITY NOTE: these strings are executed with ``shell=True`` and
        interpolate spreadsheet cell contents; a crafted cell containing shell
        metacharacters could inject commands. Acceptable only for trusted
        input files — consider shlex.quote() on the interpolated terms.
        """
        cmd = []
        for i in range(len(self.queries)):
            if self.queries[i]['mode'] == "SEM":
                crawlCmd = "python3 /home/maddiekorea/py/stdCrawler.py" + " -m SEM -u " + str(self.campaign) + " -q \"" + str(self.queries[i]['query']) + "\" >> " + str(self.resfile)
            else:
                crawlCmd = "python3 /home/maddiekorea/py/exceptionCrawler.py \"" + str(self.queries[i]['imarketTerm']) + "\" \"" + str(self.queries[i]['naverTerm']) + "\" " + str(self.queries[i]['mode']) + " >> " + str(self.resfile)
            cmd.append(crawlCmd)
        return cmd

    def excution(self, commandpointer):
        """Run the command at ``commandpointer`` and log its duration.

        Blocks until the subprocess exits; logs "<k>/<total> <query> <elapsed>"
        at WARNING level. (Name is a historical misspelling of "execution",
        kept for backward compatibility — see the ``execution`` alias below.)

        :param commandpointer: index into ``self.commands`` / ``self.queries``.
        """
        startTime = datetime.now()
        executeCrawl = subprocess.Popen(self.commands[commandpointer], stdout=subprocess.PIPE, shell=True)
        executeCrawl.wait()
        endTime = datetime.now() - startTime
        logging.warning(str(commandpointer + 1) + "/" + str(len(self.commands)) + " " + self.queries[commandpointer]['query'] + "\t" + str(endTime))

    # Correctly spelled alias; new callers should prefer this name.
    execution = excution