import pandas as pd
import requests, bs4, urllib, sys, re, math, logging, os
from datetime import datetime
import subprocess
import pathlib

# Make the local imarketDetail module importable regardless of the working directory.
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation)
import imarketDetail


class wgetWrapper:
    """Builds and runs wget commands for product images listed in a tab-separated input file."""

    def __init__(self, xlsxfile):
        self.startTime = datetime.now()
        self.xlsxfile = xlsxfile
        self.df = self.dataFrame()
        self.targetDirectory = '/home/maddiekorea/imgs/crawl'
        self.imglist = self.prdImgArray()
        self.downloadcommands = self.crawlCmd()
        self.resfile = self.resFile()

    def timeLog(self):
        # Log the total elapsed time since the wrapper was created.
        consumtime = datetime.now() - self.startTime
        logging.warning("Download Completed.\t" + str(consumtime))

    def dataFrame(self):
        # The input file is tab-separated despite the .xlsx-style attribute name.
        df = pd.read_csv(self.xlsxfile, sep='\t', header=0)
        return df

    def getImgfromHTML(self, prdCode, mode, action, content, prjname):
        # Build one task dict per <img> tag found in the HTML content.
        imgs = self.extractImgfromContent(content)
        res = []
        for i in range(len(imgs)):
            _img = {}
            _img['productCode'] = prdCode
            _img['prjname'] = prjname
            _img['mode'] = mode
            _img['action'] = action
            _img['seq'] = i
            _img['img'] = imgs[i].get('src')
            res.append(_img)
        return res

    def extractImgfromContent(self, content):
        # Return the <img> tags of an HTML fragment; non-string or non-HTML input yields an empty list.
        if not isinstance(content, str):
            content = ""
        content = content.replace("alt= src", "alt=\"\" src")  # repair a malformed alt attribute
        bsCheck = bool(bs4.BeautifulSoup(content, "html.parser").find())
        if bsCheck:
            bs = bs4.BeautifulSoup(content, 'lxml')
        else:
            bs = bs4.BeautifulSoup("", 'lxml')
        imgs = bs.select('img')
        return imgs

    def prdImgArray(self):
        # Collect image download tasks from every row of the input file.
        img = []
        for i in range(len(self.df)):
            _req = {}
            _req['prdCode'] = str(self.df.iloc[i]['기준코드'])
            _req['content'] = str(self.df.iloc[i]['상세정보'])
            _req['action'] = str(self.df.iloc[i]['action'])
            _req['mode'] = str(self.df.iloc[i]['모드'])
            _req['prjname'] = str(self.df.iloc[i]['프로젝트명'])
            if _req['action'] == "check":
                # "check" rows: fetch the image list from the existing detail page.
                _img = imarketDetail.imarketDetail(_req['prjname'], _req['mode'], _req['prdCode']).getImgArray()
                img.append(_img)
            if _req['action'] == "new":
                # "new" rows: parse the image list out of the HTML stored in the input file.
                _img = self.getImgfromHTML(_req['prdCode'], _req['mode'], _req['action'], _req['content'], _req['prjname'])
                img.append(_img)
        # Flatten the per-row lists into a single task list.
        tasks = []
        for i in range(len(img)):
            for j in range(len(img[i])):
                tasks.append(img[i][j])
        return tasks

    def resFile(self):
        # Result TSV name, stamped with the wrapper's start time.
        resfile = "imgCrawl_" + str(self.startTime.strftime("%Y%m%d-%H_%M_%s")) + ".tsv"
        return resfile

    def dirname(self, prjname, action):
        # Per-project / per-action download directory; created on first use.
        directory = self.targetDirectory + "/_" + str(prjname) + '_' + str(action) + '_' + str(self.startTime.strftime("%Y%m%d-%H_%M_%s")) + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        return directory

    def getExtension(self, filename):
        # Upper-cased extension, taken from the text after the last dot.
        ext = filename.split(".")
        extension = ext[-1].upper()
        return extension

    def filename(self, img, prdCode, mode, seq, convert):
        # Local file name: <code>_<seq>, or <code>_d_<seq> for detail ("상세") images;
        # converted files get a "_convert" suffix and a forced JPG extension.
        extension = self.getExtension(img)
        fcmode = ""
        fnmode = "_"
        if mode == "상세":
            fnmode = "_d_"
        if convert == "Y":
            fcmode = "_convert"
            extension = "JPG"
        fileName = str(prdCode) + fnmode + str(seq) + fcmode + "." + extension
        return fileName

    def writeHeader(self):
        # Write the column header of the result TSV via the shell.
        resHeader = "프로젝트" + "\t" + "모드" + "\t" + "유형" + "\t" + "기준코드" + "\t" + "시퀀스" + "\t" + "원URL" + "\t" + "파일명"
        insertheader = "echo \"" + resHeader + "\" > " + self.resFile()
        executeinsertHeader = subprocess.Popen(insertheader, stdout=subprocess.PIPE, shell=True)
        executeinsertHeader.wait()

    def crawlCmd(self):
        # Build one shell command per image: wget the file, then append a log row to the result TSV.
        commands = []
        ar = self.imglist
        for i in range(len(ar)):
            downloadfile = self.dirname(ar[i]['prjname'], ar[i]['action']) + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            orgURL = ar[i]['img']
            command = "wget -q --tries=2 --timeout=10 -o /dev/null -O \"" + downloadfile + "\" \"" + orgURL + "\";"
            logformat = str(ar[i]['prjname']) + "\t" + str(ar[i]['action']) + "\t" + str(ar[i]['mode']) + "\t" + str(ar[i]['productCode']) + "\t" + str(ar[i]['seq']) + "\t" + str(ar[i]['img']) + "\t" + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            logcommand = "echo \"" + logformat + "\" >> " + self.resFile() + ";"
            command = command + logcommand
            commands.append(command)
        return commands

    def execution(self, commandpointer):
        # Run a single prepared download command and log its position and duration.
        startTime = datetime.now()
        executeCrawl = subprocess.Popen(self.downloadcommands[commandpointer], stdout=subprocess.PIPE, shell=True)
        executeCrawl.wait()
        endTime = datetime.now() - startTime
        logging.warning("Downloading... : " + str(commandpointer + 1) + "/" + str(len(self.downloadcommands)) + "\t" + str(endTime))


########################################
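
# Hypothetical driver, not part of the original script: a minimal sketch of how wgetWrapper
# might be invoked, assuming the input TSV path is passed as the first command-line argument.
if __name__ == "__main__":
    # e.g. python imgCrawl.py products.tsv   (script and file names are assumptions)
    wrapper = wgetWrapper(sys.argv[1])
    wrapper.writeHeader()                      # write the result TSV header
    for pointer in range(len(wrapper.downloadcommands)):
        wrapper.execution(pointer)             # download one image and append its log row
    wrapper.timeLog()                          # log total elapsed time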