import os
import re
import sys
import logging
import subprocess
from datetime import datetime
from multiprocessing import Pool
from urllib.parse import unquote

import pandas as pd

ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation + "/lib")

# Landing pages whose utm parameter carries a bare URL instead of a keyword;
# for these rows we fall back to the sc.queryText term.
FALLBACK_URLS = (
    "http://www.imarket.co.kr/display/malls.do?_method=searchGoods",
    "http://www.imarket.co.kr/display/malls.do?BIZ_CD=1010187",
    "http://www.imarket.co.kr/gate/ippgw.jsp?BIZ_CD=1010187",
)


class interpret_SA_URL:
    def __init__(self, xlsxfile):
        self.startTime = datetime.now()
        self.stdCrawler = "/home/maddiekorea/py/stdCrawler.py"
        self.xlsxfile = xlsxfile
        self.queries = self.makeQueries()
        self.commands = self.commandbuild()

    def timelog(self):
        elapsed = datetime.now() - self.startTime
        logging.warning("Completed.\t" + str(elapsed))

    def resFile(self):
        # %S (seconds), not %s: lowercase %s is a non-portable glibc extension
        # that expands to the Unix epoch timestamp.
        return "test_" + self.startTime.strftime("%Y%m%d-%H_%M_%S") + ".tsv"

    def makeQueries(self):
        df = pd.read_excel(self.xlsxfile)
        crawlerQueries = []
        for i in range(len(df)):
            data = str(df.iloc[i]['urls'])

            # Internal search term: sc.queryText is EUC-KR percent-encoded,
            # with '+' standing in for spaces.
            imTerm = re.sub(r'^.+sc\.queryText=', "", data)
            imTerm = re.sub(r'&.+$', "", imTerm)
            imTerm = unquote(imTerm, encoding='euc-kr')
            imTerm = re.sub(r'\+', " ", imTerm)

            # Ad-tracking keyword: utm_term (or utm_keyword) is UTF-8
            # percent-encoded.
            if "utm_term" in data:
                nTerm = re.sub(r'^.+utm_term=', "", data)
            else:
                nTerm = re.sub(r'^.+utm_keyword=', "", data)
            nTerm = re.sub(r'&.+$', "", nTerm)
            nTerm = unquote(nTerm, encoding='utf-8')
            nTerm = re.sub(r'\+', "", nTerm)     # drop '+' separators
            nTerm = re.sub(r'^\$\$', "", nTerm)  # strip a leading '$$' marker

            # Some rows carry a bare landing-page URL instead of a keyword.
            if nTerm in FALLBACK_URLS:
                nTerm = imTerm

            # If the two terms match (ignoring spaces), emit one query;
            # otherwise pair them as "internal:tracking".
            if imTerm.replace(" ", "") == nTerm.replace(" ", ""):
                crawlerQuery = imTerm
            else:
                crawlerQuery = imTerm + ":" + nTerm
            crawlerQueries.append(crawlerQuery)
        return crawlerQueries

    def commandbuild(self):
        commands = []
        for query in self.queries:
            command = ("python3 " + self.stdCrawler + " -m SEM -u test -q \""
                       + str(query) + "\" >> " + self.resFile())
            commands.append(command)
        return commands

    def execution(self, commandpointer):
        startTime = datetime.now()
        # The command appends its own output via the shell '>>' redirection,
        # so nothing needs to be captured here. Note that concurrent workers
        # appending to the same .tsv may interleave lines.
        subprocess.run(self.commands[commandpointer], shell=True)
        elapsed = datetime.now() - startTime
        logging.warning(str(commandpointer + 1) + "/" + str(len(self.commands))
                        + "\t" + str(elapsed))


if __name__ == "__main__":
    sa_url = interpret_SA_URL(str(sys.argv[1]))
    # print(sa_url.queries)
    with Pool(processes=4) as pool:
        pool.map(sa_url.execution, range(len(sa_url.commands)))
    sa_url.timelog()
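
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; the script name, workbook name, and sample URL
# below are hypothetical, not taken from real data). Run the script with an
# .xlsx file whose 'urls' column holds landing-page URLs:
#
#   python3 interpret_sa_url.py clicks.xlsx
#
# For a row such as
#
#   http://www.imarket.co.kr/display/malls.do?sc.queryText=%B0%A1&utm_term=test
#
# makeQueries() decodes sc.queryText as EUC-KR ("%B0%A1" -> "가") and
# utm_term as UTF-8 ("test"); since the two differ after space-stripping, the
# crawler query becomes "가:test", and commandbuild() turns it into roughly:
#
#   python3 /home/maddiekorea/py/stdCrawler.py -m SEM -u test -q "가:test" >> test_20240101-12_00_00.tsv
# ---------------------------------------------------------------------------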