import pandas as pd
import bs4
import sys
import logging
import os
import subprocess
from datetime import datetime

# Make the sibling imarketDetail module importable regardless of the
# working directory the script is launched from.
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation)
import imarketDetail

class wgetWrapper:
    """Builds wget download commands for product images listed in a
    tab-separated export, runs them, and logs the results to a TSV."""

    def __init__(self, xlsxfile):
        self.startTime = datetime.now()
        self.xlsxfile = xlsxfile
        self.df = self.dataFrame()
        self.targetDirectory = '/home/maddiekorea/imgs/crawl'
        self.imglist = self.prdImgArray()
        self.downloadcommands = self.crawlCmd()
        self.resfile = self.resFile()

    def timeLog(self):
        # Report the total elapsed time since the wrapper was created.
        consumedTime = datetime.now() - self.startTime
        logging.warning("Download Completed.\t" + str(consumedTime))

    def dataFrame(self):
        # Despite the xlsxfile name, the input is read as a tab-separated
        # text file with a header row.
        df = pd.read_csv(self.xlsxfile, sep='\t', header=0)
        return df
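
    # Illustrative input layout (an assumption based on the columns read in
    # prdImgArray; the real export may carry more columns, in any order):
    #
    #   프로젝트명\t모드\taction\t기준코드\t상세정보
    #   projA\t상세\tnew\tP001\t<div><img src="http://example.com/a.jpg"></div>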

    def getImgfromHTML(self, prdCode, mode, action, content, prjname):
        # Turn every <img> tag found in the HTML content into one task record.
        imgs = self.extractImgfromContent(content)
        res = []
        for i, tag in enumerate(imgs):
            _img = {}
            _img['productCode'] = prdCode
            _img['prjname'] = prjname
            _img['mode'] = mode
            _img['action'] = action
            _img['seq'] = i
            _img['img'] = tag.get('src')
            res.append(_img)
        return res

    def extractImgfromContent(self, content):
        # Non-string content (e.g. NaN from pandas) gets a placeholder image.
        if not isinstance(content, str):
            content = "<img src=\"noneContent.img\" />"
        # Repair a recurring source artifact: a bare `alt=` glued to `src`.
        content = content.replace("alt= src", "alt=\"\" src")
        # If the content parses as HTML, scrape its <img> tags directly;
        # otherwise treat the whole string as a single image URL.
        bsCheck = bool(bs4.BeautifulSoup(content, "html.parser").find())
        if not bsCheck:
            content = "<img src=\"" + content + "\" />"
        bs = bs4.BeautifulSoup(content, 'lxml')
        imgs = bs.select('img')
        return imgs
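
    # For example (illustrative values, not from the source data):
    #   extractImgfromContent('<p><img src="a.jpg"></p>')
    #       -> [<img src="a.jpg"/>]
    #   extractImgfromContent('http://example.com/b.png')   # plain-URL fallback
    #       -> [<img src="http://example.com/b.png"/>]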

    def prdImgArray(self):
        # Build one download task per image. Korean column names:
        # 기준코드 = product code, 상세정보 = detail HTML, 모드 = mode,
        # 프로젝트명 = project name.
        img = []
        for i in range(len(self.df)):
            _req = {}
            _req['prdCode'] = str(self.df.iloc[i]['기준코드'])
            _req['content'] = str(self.df.iloc[i]['상세정보'])
            _req['action'] = str(self.df.iloc[i]['action'])
            _req['mode'] = str(self.df.iloc[i]['모드'])
            _req['prjname'] = str(self.df.iloc[i]['프로젝트명'])
            if _req['action'] == "check":
                # "check" rows re-read the image list from the live detail page.
                _img = imarketDetail.imarketDetail(_req['prjname'], _req['mode'], _req['prdCode']).getImgArray()
                img.append(_img)
            elif _req['action'] == "new":
                # "new" rows extract images from the HTML carried in the file.
                _img = self.getImgfromHTML(_req['prdCode'], _req['mode'], _req['action'], _req['content'], _req['prjname'])
                img.append(_img)
        # Flatten the per-product lists into a single task list.
        tasks = []
        for productImgs in img:
            for task in productImgs:
                tasks.append(task)
        return tasks
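
    # A flattened task looks like (illustrative values):
    #   {'productCode': 'P001', 'prjname': 'projA', 'mode': '상세',
    #    'action': 'new', 'seq': 0, 'img': 'http://example.com/a.jpg'}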

    def resFile(self):
        # %S (seconds) rather than the platform-dependent %s (epoch time).
        resfile = "imgCrawl_" + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + ".tsv"
        return resfile
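
    # e.g. a run started at 2024-01-02 13:04:05 yields
    # "imgCrawl_20240102-13_04_05.tsv".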

    def dirname(self, prjname, action):
        # One target directory per project/action/run, created on first use.
        directory = self.targetDirectory + "/_" + str(prjname) + '_' + str(action) + '_' + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        return directory

    def getExtension(self, filename):
        # Everything after the last dot, upper-cased.
        return filename.split(".")[-1].upper()
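
    # e.g. getExtension("photo.large.jpeg") -> "JPEG"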

    def filename(self, img, prdCode, mode, seq, convert):
        # Compose <prdCode>_<seq>.<EXT>; "상세" (detail) mode inserts "_d_"
        # instead of "_", and convert == "Y" adds "_convert" and forces JPG.
        extension = self.getExtension(img)
        fcmode = ""
        fnmode = "_"
        if mode == "상세":
            fnmode = "_d_"
        if convert == "Y":
            fcmode = "_convert"
            extension = "JPG"
        fileName = str(prdCode) + fnmode + str(seq) + fcmode + "." + extension
        return fileName
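
    # e.g. (illustrative arguments):
    #   filename("http://example.com/img.png", "P001", "상세", 0, "N") -> "P001_d_0.PNG"
    #   filename("http://example.com/img.png", "P001", "base", 0, "Y") -> "P001_0_convert.JPG"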

    def writeHeader(self):
        # TSV header columns (Korean): 프로젝트 = project, 모드 = mode,
        # 유형 = type, 기준코드 = product code, 시퀀스 = sequence,
        # 원URL = source URL, 파일명 = file name.
        resHeader = "프로젝트" + "\t" + "모드" + "\t" + "유형" + "\t" + "기준코드" + "\t" + "시퀀스" + "\t" + "원URL" + "\t" + "파일명"
        insertheader = "echo \"" + resHeader + "\" > " + self.resFile()
        executeinsertHeader = subprocess.Popen(insertheader, stdout=subprocess.PIPE, shell=True)
        executeinsertHeader.wait()

    def crawlCmd(self):
        # Build one shell command per image: a wget download followed by an
        # echo appending the task's metadata to the result TSV. Note that the
        # URL and file name are only double-quoted, not shell-escaped.
        commands = []
        ar = self.imglist
        for i in range(len(ar)):
            downloadfile = self.dirname(ar[i]['prjname'], ar[i]['action']) + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            orgURL = ar[i]['img']
            command = "wget -q --tries=2 --timeout=10 -o /dev/null -O \"" + downloadfile + "\" \"" + orgURL + "\";"
            logformat = str(ar[i]['prjname']) + "\t" + str(ar[i]['action']) + "\t" + str(ar[i]['mode']) + "\t" + str(ar[i]['productCode']) + "\t" + str(ar[i]['seq']) + "\t" + str(ar[i]['img']) + "\t" + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            logcommand = "echo \"" + logformat + "\" >> " + self.resFile() + ";"
            command = command + logcommand
            commands.append(command)
        return commands
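
    # A generated command looks roughly like (illustrative paths; the echo
    # payload carries literal tab characters):
    #   wget -q --tries=2 --timeout=10 -o /dev/null \
    #        -O "/home/maddiekorea/imgs/crawl/_projA_new_20240102-13_04_05/P001_0.JPG" \
    #        "http://example.com/a.jpg";echo "projA<TAB>new<TAB>..." >> imgCrawl_20240102-13_04_05.tsv;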

    def execution(self, commandpointer):
        # Run one prepared command and log per-download timing. Use the
        # prebuilt command list for the count instead of recomputing
        # crawlCmd() on every call.
        startTime = datetime.now()
        executeCrawl = subprocess.Popen(self.downloadcommands[commandpointer], stdout=subprocess.PIPE, shell=True)
        executeCrawl.wait()
        endTime = datetime.now() - startTime
        logging.warning("Downloading... : " + str(commandpointer + 1) + "/" + str(len(self.downloadcommands)) + "\t" + str(endTime))

########################################
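
# Example usage (a sketch, not from the source; assumes a tab-separated
# export with the columns read in prdImgArray and that imarketDetail is
# importable next to this script):
#
#   wrapper = wgetWrapper("/path/to/products.tsv")
#   wrapper.writeHeader()
#   for i in range(len(wrapper.downloadcommands)):
#       wrapper.execution(i)
#   wrapper.timeLog()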