# Source: python_apps/crwlers/stdCrawler.py
# Last modified: 2023-11-03 14:49:12 +09:00
# (126 lines, 4.2 KiB, Python)
#!/usr/bin/env python
import os, sys, subprocess, argparse, re
# Absolute directory containing this script, resolved so the crawler works
# regardless of the caller's current working directory.
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
# Make the bundled helper modules (imarketlib, naver) importable from ./lib.
sys.path.append(ScriptLocation + "/lib")
def getSummaryResults(inlistrow):
    """Crawl both iMarket and Naver for one query row and merge the summaries.

    Args:
        inlistrow: dict with at least 'term', 'nTerm', 'mode' and
            'utm_campaign' keys (as built by getinlineQuery).

    Returns:
        dict combining the iMarket and Naver summary results; on key
        collision the Naver values win (same precedence as the original
        update order).
    """
    # Deferred imports: these helpers live in ./lib, added to sys.path above.
    import imarketlib
    import naver
    attrDic = {'utm_campaign': inlistrow['utm_campaign'],
               'utm_keyword': inlistrow['nTerm']}
    imarket_client = imarketlib.imarketGet(inlistrow['term'], inlistrow['mode'], attrDic)
    # Bind to a distinct name: the original rebinding `naver = naver.NaverGet(...)`
    # shadowed the imported `naver` module object.
    naver_client = naver.NaverGet(inlistrow['nTerm'])
    res = {}
    res.update(imarket_client.getSummaryResult())
    res.update(naver_client.getSummaryResult())
    return res
def getImarketCrawl(inlistrow):
    """Run a full (non-summary) iMarket crawl for one prepared query row.

    Args:
        inlistrow: dict with 'term' and 'mode' keys (from getinlineQuery).

    Returns:
        the full crawl result produced by imarketlib.
    """
    import imarketlib  # helper module from ./lib (deferred import)
    crawler = imarketlib.imarketGet(inlistrow['term'], inlistrow['mode'], {})
    return crawler.getfullResult()
def getinlineQuery(**kwargs):
    """Expand a comma-separated query string into per-term parameter dicts.

    kwargs must contain 'query', a comma-separated list of terms. A term may
    be written "imarketTerm:adTerm" — the colon splits the iMarket search
    term (left) from the ad/Naver term (right). Without a colon the same
    term is used for both, with spaces stripped from the ad term. Every
    other kwarg (e.g. 'mode', 'utm_campaign') is copied verbatim into each
    resulting dict.

    Returns:
        list of dicts, one per term, each carrying 'term', 'nTerm' and the
        pass-through kwargs.
    """
    # Pass-through parameters shared by every row; computed once, not per term.
    extras = {key: value for key, value in kwargs.items() if key != 'query'}
    qres = []
    for raw in kwargs['query'].split(','):
        parts = raw.split(':')
        if len(parts) <= 1:
            # No colon: one term serves both roles; the ad term loses spaces.
            para = {'term': raw, 'nTerm': raw.replace(" ", "")}
        else:
            # Colon present: left side for iMarket, right side for ads.
            para = {'term': parts[0], 'nTerm': parts[1]}
        para.update(extras)
        qres.append(para)
    return qres
def GoCrawl(inlist):
    """Dispatch each prepared query row to the crawler matching its mode.

    "SEM" rows get the combined iMarket+Naver summary crawl; every other
    mode gets a full iMarket crawl.

    Args:
        inlist: list of row dicts produced by getinlineQuery.

    Returns:
        list of per-row result dicts, in input order.
    """
    res = []
    # Iterate rows directly instead of the original range(len(...)) indexing.
    for row in inlist:
        if row['mode'] == "SEM":
            res.append(getSummaryResults(row))
        else:
            res.append(getImarketCrawl(row))
    return res
def goSearchTsv(inlist):
    """Render nested search-crawl results as TSV text.

    For every row dict a header line (its keys) and a value line are
    emitted. Each cell is followed by a tab — including the last cell on a
    line — to match the original output byte-for-byte.

    Args:
        inlist: list of lists of dicts (one inner list per query).

    Returns:
        the TSV string; '' when inlist is empty.
    """
    # Collect fragments and join once: the original built the string with
    # repeated `+=`, which is quadratic in the number of cells.
    parts = []
    for group in inlist:
        for row in group:
            parts.append("".join(key + "\t" for key in row) + "\n")
            parts.append("".join(str(value) + "\t" for value in row.values()) + "\n")
    return "".join(parts)
def goSemTsv(inlist):
    """Render SEM-crawl results as a single tab-separated line of values.

    No header row and no trailing newline are produced; each value is
    followed by a tab, including the last one (unchanged from the original
    behavior — the commented-out header code has been removed).

    Args:
        inlist: list of result dicts.

    Returns:
        the concatenated values string; '' when inlist is empty.
    """
    # One join instead of the original quadratic `+=` accumulation.
    return "".join(str(value) + "\t"
                   for row in inlist
                   for value in row.values())
# --- Command-line interface ---------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--mode', type=str, required=True, help="select crawling mode: SEM, search, category")
#parser.add_argument('-r', '--resultType', type=str, help="select full or summary")
parser.add_argument('-q', '--query', type=str, help="query array separated by ',', not with -i option")
parser.add_argument('-u', '--utm_campaign', type=str, help="campaign name required by SEM mode")
#parser.add_argument('-t', '--utm_content', type=str, help="content name optionally required by SEM mode")
parser.add_argument('-o', '--output', type=str, help="output format")
# Parse eagerly at import time; this module is a script, not a library.
parargs = parser.parse_args()
# --- Main dispatch -------------------------------------------------------
# Validate arguments, run the crawl for the selected mode, then emit the
# results in the requested output format.
if parargs.query is None:
    print("error: plz insert queries")
elif parargs.mode is None:
    # NOTE(review): --mode is required=True, so argparse rejects a missing
    # mode before this branch can run; kept for defensive parity.
    print("plz set mode. SEM or search")
else:
    result = None  # stays None when no crawl branch actually ran
    if parargs.mode == "SEM":
        if parargs.utm_campaign is None:
            print("error: plz add utm_campaign parameter by adding -u")
        else:
            res = getinlineQuery(query=parargs.query,
                                 resultType='summary',  # was misspelled 'resultype'; key is not read downstream
                                 mode=parargs.mode,
                                 utm_campaign=parargs.utm_campaign)
            #utm_content = parargs.utm_content)
            result = GoCrawl(res)
    elif parargs.mode == "search":
        res = getinlineQuery(query=parargs.query,
                             resultType='full',
                             mode=parargs.mode)
        result = GoCrawl(res)
    # Guard fixes a NameError: the original referenced `result` even when
    # SEM mode lacked -u or the mode matched no crawl branch.
    if result is not None:
        if parargs.output == 'json':
            print(result)
        elif parargs.output == 'db':
            import pymysql
            # SECURITY: database credentials are hard-coded in source; move
            # them to a config file or environment variables.
            conn = pymysql.connect(host='localhost', user='maddiekorea', password='mad(#lin', db='maddiekorea', unix_socket='/var/run/mysqld/mysqld.sock', charset='utf8')
            # NOTE(review): `conn` is opened but never used or closed here —
            # presumably a DB insert was planned; confirm before removing.
            if parargs.mode == 'search':
                print(goSearchTsv(result))
            else:
                print(goSemTsv(result))
        else:
            if parargs.mode == 'search':
                print(goSearchTsv(result))
            else:
                print(goSemTsv(result))