# python_apps/crwlers/xlsxSAKwChecker.py
# Last modified: 2023-11-03 14:49:12 +09:00

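"""Check search-ad (SA) keywords from an .xlsx list of landing URLs.

Reads a spreadsheet with a 'urls' column, extracts the on-site search
keyword (sc.queryText, EUC-KR encoded) and the ad-tracking keyword
(utm_term/utm_keyword, UTF-8 encoded) from each URL, then runs
stdCrawler.py for every extracted query in a pool of four worker
processes, appending results to a timestamped .tsv file.
"""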
import os
import re
import sys
import logging
import subprocess
from urllib.parse import unquote
from datetime import datetime
from multiprocessing import Pool

import pandas as pd

# Make the bundled lib/ directory importable.
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation + "/lib")


class interpret_SA_URL:
    """Parse ad-landing URLs from an .xlsx file and build crawler commands."""

    def __init__(self, xlsxfile):
        self.startTime = datetime.now()
        self.stdCrawler = "/home/maddiekorea/py/stdCrawler.py"
        self.xlsxfile = xlsxfile
        self.queries = self.makeQueries()
        self.commands = self.commandbuild()

    def timelog(self):
        consumedTime = datetime.now() - self.startTime
        logging.warning("Completed.\t" + str(consumedTime))

    def resFile(self):
        # %S (seconds of the minute), not %s (epoch seconds), so the
        # suffix stays readable and portable.
        resfile = "test_" + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + ".tsv"
        return resfile

    def makeQueries(self):
        # The sheet is expected to have a column named 'urls'.
        df = pd.read_excel(self.xlsxfile)
        crawlerQueries = []
        for i in range(len(df)):
            data = str(df.iloc[i]['urls'])
            # Keyword from the landing page's own search parameter
            # (sc.queryText is percent-encoded in EUC-KR).
            imTerm = re.sub(r'^.+sc\.queryText=', "", data)
            imTerm = re.sub(r'&.+$', "", imTerm)
            imTerm = unquote(imTerm, encoding='euc-kr')
            imTerm = re.sub(r'\+', " ", imTerm)
            # Keyword from the ad-tracking parameter (UTF-8 encoded);
            # fall back to utm_keyword when utm_term is absent.
            if "utm_term" in data:
                nTerm = re.sub(r'^.+utm_term=', "", data)
            else:
                nTerm = re.sub(r'^.+utm_keyword=', "", data)
            nTerm = re.sub(r'&.+$', "", nTerm)
            nTerm = unquote(nTerm, encoding='utf-8')
            nTerm = re.sub(r'\+', "", nTerm)
            nTerm = re.sub(r'^\$\$', "", nTerm)
            # If neither tracking parameter existed, nTerm is still the
            # whole URL; fall back to the on-site search keyword.
            if nTerm in ("http://www.imarket.co.kr/display/malls.do?_method=searchGoods",
                         "http://www.imarket.co.kr/display/malls.do?BIZ_CD=1010187",
                         "http://www.imarket.co.kr/gate/ippgw.jsp?BIZ_CD=1010187"):
                nTerm = imTerm
            # Compare the two keywords ignoring spaces; emit "imTerm:nTerm"
            # only when they genuinely differ.
            pimTerm = re.sub(r' ', '', imTerm)
            pnTerm = re.sub(r' ', '', nTerm)
            if pimTerm == pnTerm:
                crawlerQuery = imTerm
            else:
                crawlerQuery = imTerm + ":" + nTerm
            crawlerQueries.append(crawlerQuery)
        return crawlerQueries
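
    # Worked example of the extraction above (hypothetical URL shape, for
    # illustration only):
    #   ...malls.do?sc.queryText=%B3%EB%C6%AE%BA%CF&utm_term=%EB%85%B8%ED%8A%B8%EB%B6%81
    #   imTerm -> "노트북" (EUC-KR decode), nTerm -> "노트북" (UTF-8 decode);
    #   the two match once spaces are stripped, so crawlerQuery == "노트북".
    #   Had they differed, the query would be "imTerm:nTerm".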

    def commandbuild(self):
        commands = []
        # startTime is fixed at __init__, so resFile() is stable; build the
        # name once and append every crawl's output to the same .tsv.
        resfile = self.resFile()
        for query in self.queries:
            command = ("python3 " + self.stdCrawler
                       + " -m SEM -u test -q \"" + str(query) + "\" >> " + resfile)
            commands.append(command)
        return commands
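
    # Each built command has the shape (values assumed for illustration):
    #   python3 /home/maddiekorea/py/stdCrawler.py -m SEM -u test \
    #       -q "노트북" >> test_20231103-14_49_12.tsv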

    def execution(self, commandpointer):
        startTime = datetime.now()
        # shell=True so the '>>' redirection in the command string works;
        # the crawler's stdout goes to the .tsv, not to the pipe.
        executeCrawl = subprocess.Popen(self.commands[commandpointer],
                                        stdout=subprocess.PIPE, shell=True)
        executeCrawl.wait()
        elapsed = datetime.now() - startTime
        logging.warning(str(commandpointer + 1) + "/" + str(len(self.commands))
                        + "\t" + str(elapsed))


if __name__ == "__main__":
    # Build queries/commands once in the parent; each Pool worker receives
    # the bound method together with its instance.
    sa_url = interpret_SA_URL(str(sys.argv[1]))
    # print(sa_url.queries)
    with Pool(processes=4) as pool:
        pool.map(sa_url.execution, range(len(sa_url.commands)))
    sa_url.timelog()
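
# Usage:
#   python3 xlsxSAKwChecker.py urls.xlsx
# where urls.xlsx contains the landing URLs in a column named 'urls'.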