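# Read an Excel sheet of search-ad (SEM) landing URLs, derive a crawler query from each
# row, and run stdCrawler.py for every query in a pool of 4 worker processes.
# Usage: python3 <this script> path/to/urls.xlsx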
import os
import re
import sys
import logging
import subprocess
from urllib.parse import unquote
from datetime import datetime
from multiprocessing import Pool

import pandas as pd

# Make the lib/ directory next to this script importable.
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(ScriptLocation, "lib"))


class interpret_SA_URL:
    """Parse crawler queries out of search-ad landing URLs and drive stdCrawler.py."""

    def __init__(self, xlsxfile):
        self.startTime = datetime.now()
        self.stdCrawler = "/home/maddiekorea/py/stdCrawler.py"
        self.xlsxfile = xlsxfile
        self.queries = self.makeQueries()
        self.commands = self.commandbuild()

    def timelog(self):
        # Log the total time elapsed since this object was created.
        consumedTime = datetime.now() - self.startTime
        logging.warning("Completed.\t" + str(consumedTime))

    def resFile(self):
        # Name of the shared .tsv result file, derived from the run's start timestamp.
        resfile = "test_" + self.startTime.strftime("%Y%m%d-%H_%M_%S") + ".tsv"
        return resfile

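    # Hypothetical input row, to illustrate what makeQueries() extracts from each URL:
    #   urls   = ".../malls.do?_method=searchGoods&sc.queryText=<EUC-KR-encoded term>&utm_term=<UTF-8-encoded keyword>&..."
    #   imTerm <- sc.queryText value (EUC-KR decoded, '+' turned into spaces)
    #   nTerm  <- utm_term (or utm_keyword) value (UTF-8 decoded, '+' and leading '$$' removed)
    #   query  <- imTerm when both match ignoring spaces, otherwise "imTerm:nTerm"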
    def makeQueries(self):
        # Read the spreadsheet of landing URLs and build one crawler query per row.
        df = pd.read_excel(self.xlsxfile)
        crawlerQueries = []
        for i in range(len(df)):
            data = str(df.iloc[i]['urls'])

            # Site search term: value of the sc.queryText parameter, EUC-KR encoded.
            imTerm = re.sub(r'^.+sc\.queryText=', "", data)
            imTerm = re.sub(r'&.+$', "", imTerm)
            imTerm = unquote(imTerm, encoding='euc-kr')
            imTerm = re.sub(r'\+', " ", imTerm)

            # Ad keyword: utm_term when present, otherwise utm_keyword, UTF-8 encoded.
            if "utm_term" in data:
                nTerm = re.sub(r'^.+utm_term=', "", data)
            else:
                nTerm = re.sub(r'^.+utm_keyword=', "", data)
            nTerm = re.sub(r'&.+$', "", nTerm)
            nTerm = unquote(nTerm, encoding='utf-8')
            nTerm = re.sub(r'\+', "", nTerm)
            nTerm = re.sub(r'^\$\$', "", nTerm)

            # Some rows carry a bare landing URL instead of a keyword; fall back to the search term.
            if nTerm in ("http://www.imarket.co.kr/display/malls.do?_method=searchGoods",
                         "http://www.imarket.co.kr/display/malls.do?BIZ_CD=1010187",
                         "http://www.imarket.co.kr/gate/ippgw.jsp?BIZ_CD=1010187"):
                nTerm = imTerm

            # Compare the two terms ignoring spaces; keep both only when they differ.
            pimTerm = re.sub(r' ', '', imTerm)
            pnTerm = re.sub(r' ', '', nTerm)
            if pimTerm == pnTerm:
                crawlerQuery = imTerm
            else:
                crawlerQuery = imTerm + ":" + nTerm
            crawlerQueries.append(crawlerQuery)
        return crawlerQueries

    def commandbuild(self):
        # Build one stdCrawler.py shell command per query; all output is appended to one .tsv file.
        commands = []
        resfile = self.resFile()
        for query in self.queries:
            command = ("python3 " + self.stdCrawler
                       + " -m SEM -u test -q \"" + str(query) + "\" >> " + resfile)
            commands.append(command)
        return commands

    def execution(self, commandpointer):
        # Run one crawler command through the shell and log its position and elapsed time.
        startTime = datetime.now()
        executeCrawl = subprocess.Popen(self.commands[commandpointer], stdout=subprocess.PIPE, shell=True)
        executeCrawl.wait()
        elapsed = datetime.now() - startTime
        logging.warning(str(commandpointer + 1) + "/" + str(len(self.commands)) + "\t" + str(elapsed))


if __name__ == "__main__":
    # Build the query list from the xlsx path given on the command line,
    # then run the crawler commands with 4 worker processes.
    sa_url = interpret_SA_URL(str(sys.argv[1]))
    #print(sa_url.queries)
    with Pool(processes=4) as pool:
        pool.map(sa_url.execution, range(len(sa_url.commands)))
    sa_url.timelog()