import pandas as pd
import requests, bs4, urllib, sys, re, math, logging, os
from datetime import datetime
import subprocess
import pathlib
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation)
import imarketDetail
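# wgetWrapper reads product rows from a tab-separated spreadsheet export, collects the
# product-detail image URLs for each row, and builds wget commands that download every
# image and append a result row to a TSV log.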
class wgetWrapper:
    def __init__(self, xlsxfile):
        self.startTime = datetime.now()
        self.xlsxfile = xlsxfile
        self.df = self.dataFrame()
        # Root directory under which per-project download folders are created.
        self.targetDirectory = '/home/maddiekorea/imgs/crawl'
        self.imglist = self.prdImgArray()
        self.downloadcommands = self.crawlCmd()
        self.resfile = self.resFile()
    def timeLog(self):
        elapsed = datetime.now() - self.startTime
        logging.warning("Download completed.\t" + str(elapsed))
    def dataFrame(self):
        # Despite the attribute name, the input is read as a tab-separated text file.
        df = pd.read_csv(self.xlsxfile, sep='\t', header=0)
        return df
    def getImgfromHTML(self, prdCode, mode, action, content, prjname):
        imgs = self.extractImgfromContent(content)
        res = []
        # One record per <img> tag; the tag's position doubles as the sequence number.
        for i, imgTag in enumerate(imgs):
            _img = {}
            _img['productCode'] = prdCode
            _img['prjname'] = prjname
            _img['mode'] = mode
            _img['action'] = action
            _img['seq'] = i
            _img['img'] = imgTag.get('src')
            res.append(_img)
        return res
    def extractImgfromContent(self, content):
        # Non-string cells (e.g. NaN from pandas) are treated as empty markup.
        if type(content) != str:
            content = ""
        # Repair malformed 'alt= src' attributes so the src is not swallowed by the parser.
        content = content.replace("alt= src", "alt=\"\" src")
        bsCheck = bool(bs4.BeautifulSoup(content, "html.parser").find())
        if bsCheck:
            bs = bs4.BeautifulSoup(content, 'lxml')
            imgs = bs.select('img')
        else:
            # No parseable HTML in the cell: nothing to download.
            imgs = []
        return imgs
    def prdImgArray(self):
        # Column names follow the Korean spreadsheet export:
        # 기준코드 = product code, 상세정보 = detail HTML, 모드 = mode, 프로젝트명 = project name.
        img = []
        for i in range(len(self.df)):
            _req = {}
            _req['prdCode'] = str(self.df.iloc[i]['기준코드'])
            _req['content'] = str(self.df.iloc[i]['상세정보'])
            _req['action'] = str(self.df.iloc[i]['action'])
            _req['mode'] = str(self.df.iloc[i]['모드'])
            _req['prjname'] = str(self.df.iloc[i]['프로젝트명'])
            if _req['action'] == "check":
                # "check" rows: image URLs come from the imarketDetail helper module.
                _img = imarketDetail.imarketDetail(_req['prjname'], _req['mode'], _req['prdCode']).getImgArray()
                img.append(_img)
            if _req['action'] == "new":
                # "new" rows: image URLs are parsed out of the detail HTML column.
                _img = self.getImgfromHTML(_req['prdCode'], _req['mode'], _req['action'], _req['content'], _req['prjname'])
                img.append(_img)
        # Flatten the per-row lists into a single task list.
        tasks = []
        for row in img:
            tasks.extend(row)
        return tasks
    def resFile(self):
        # Result log named after the run start time, e.g. imgCrawl_20240101-12_30_45.tsv
        resfile = "imgCrawl_" + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + ".tsv"
        return resfile
    def dirname(self, prjname, action):
        # One download directory per project/action pair, stamped with the run start time.
        directory = self.targetDirectory + "/_" + str(prjname) + '_' + str(action) + '_' + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        return directory
    def getExtension(self, filename):
        return filename.split(".")[-1].upper()
    def filename(self, img, prdCode, mode, seq, convert):
        extension = self.getExtension(img)
        fcmode = ""
        fnmode = "_"
        # "상세" (detail) images get a "_d_" marker in the file name.
        if mode == "상세":
            fnmode = "_d_"
        if convert == "Y":
            # Converted images are always written out as JPG.
            fcmode = "_convert"
            extension = "JPG"
        fileName = str(prdCode) + fnmode + str(seq) + fcmode + "." + extension
        return fileName
    def writeHeader(self):
        # Result-log columns (Korean): project, mode, action type, product code, sequence, source URL, file name.
        resHeader = "프로젝트" + "\t" + "모드" + "\t" + "유형" + "\t" + "기준코드" + "\t" + "시퀀스" + "\t" + "원URL" + "\t" + "파일명"
        insertheader = "echo \"" + resHeader + "\" > " + self.resFile()
        executeinsertHeader = subprocess.Popen(insertheader, stdout=subprocess.PIPE, shell=True)
        executeinsertHeader.wait()
    def crawlCmd(self):
        commands = []
        ar = self.imglist
        for i in range(len(ar)):
            downloadfile = self.dirname(ar[i]['prjname'], ar[i]['action']) + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            orgURL = ar[i]['img']
            # Quiet wget download followed by one result row appended to the TSV log.
            command = "wget -q --tries=2 --timeout=10 -o /dev/null -O \"" + downloadfile + "\" \"" + orgURL + "\";"
            # Column order matches the header written by writeHeader(): project, mode, action, code, seq, URL, file name.
            logformat = str(ar[i]['prjname']) + "\t" + str(ar[i]['mode']) + "\t" + str(ar[i]['action']) + "\t" + str(ar[i]['productCode']) + "\t" + str(ar[i]['seq']) + "\t" + str(ar[i]['img']) + "\t" + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            logcommand = "echo \"" + logformat + "\" >> " + self.resFile() + ";"
            command = command + logcommand
            commands.append(command)
        return commands
    def execution(self, commandpointer):
        startTime = datetime.now()
        executeCrawl = subprocess.Popen(self.downloadcommands[commandpointer], stdout=subprocess.PIPE, shell=True)
        executeCrawl.wait()
        endTime = datetime.now() - startTime
        # Use the cached command list; re-calling crawlCmd() here would rebuild it (and its directories) on every download.
        logging.warning("Downloading... : " + str(commandpointer + 1) + "/" + str(len(self.downloadcommands)) + "\t" + str(endTime))
########################################
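# Minimal usage sketch (an assumption, not part of the original script): drive the wrapper
# from the command line, passing the tab-separated input file as the first argument.
if __name__ == "__main__":
    inputfile = sys.argv[1]  # hypothetical CLI argument: path to the tab-separated export
    wrapper = wgetWrapper(inputfile)
    wrapper.writeHeader()
    for pointer in range(len(wrapper.downloadcommands)):
        wrapper.execution(pointer)
    wrapper.timeLog()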