import pandas as pd
import bs4
import sys
import logging
import os
import subprocess
from datetime import datetime

# Make the sibling imarketDetail module importable regardless of the
# working directory the script is launched from.
ScriptLocation = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ScriptLocation)
import imarketDetail

class wgetWrapper:
    """Builds wget download commands for product images listed in a
    tab-separated export, runs them, and logs the results to a TSV."""

    def __init__(self, xlsxfile):
        self.startTime = datetime.now()
        self.xlsxfile = xlsxfile
        self.df = self.dataFrame()
        self.targetDirectory = '/home/maddiekorea/imgs/crawl'
        self.imglist = self.prdImgArray()
        self.downloadcommands = self.crawlCmd()
        self.resfile = self.resFile()

    def timeLog(self):
        # Report the total elapsed time since the wrapper was created.
        consumedTime = datetime.now() - self.startTime
        logging.warning("Download Completed.\t" + str(consumedTime))

    def dataFrame(self):
        # Despite the xlsxfile name, the input is read as a tab-separated
        # text file with a header row.
        df = pd.read_csv(self.xlsxfile, sep='\t', header=0)
        return df
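
    # Illustrative input layout (an assumption based on the columns read in
    # prdImgArray; the real export may carry more columns, in any order):
    #
    #   프로젝트명\t모드\taction\t기준코드\t상세정보
    #   projA\t상세\tnew\tP001\t<div><img src="http://example.com/a.jpg"></div>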

    def getImgfromHTML(self, prdCode, mode, action, content, prjname):
        # Turn every <img> tag found in the HTML content into one task record.
        imgs = self.extractImgfromContent(content)
        res = []
        for i, tag in enumerate(imgs):
            _img = {}
            _img['productCode'] = prdCode
            _img['prjname'] = prjname
            _img['mode'] = mode
            _img['action'] = action
            _img['seq'] = i
            _img['img'] = tag.get('src')
            res.append(_img)
        return res

    def extractImgfromContent(self, content):
        # Non-string content (e.g. NaN from pandas) gets a placeholder image.
        if not isinstance(content, str):
            content = "<img src=\"noneContent.img\" />"
        # Repair a recurring source artifact: a bare `alt=` glued to `src`.
        content = content.replace("alt= src", "alt=\"\" src")
        # If the content parses as HTML, scrape its <img> tags directly;
        # otherwise treat the whole string as a single image URL.
        bsCheck = bool(bs4.BeautifulSoup(content, "html.parser").find())
        if not bsCheck:
            content = "<img src=\"" + content + "\" />"
        bs = bs4.BeautifulSoup(content, 'lxml')
        imgs = bs.select('img')
        return imgs
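
    # For example (illustrative values, not from the source data):
    #   extractImgfromContent('<p><img src="a.jpg"></p>')
    #       -> [<img src="a.jpg"/>]
    #   extractImgfromContent('http://example.com/b.png')   # plain-URL fallback
    #       -> [<img src="http://example.com/b.png"/>]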

    def prdImgArray(self):
        # Build one download task per image. Korean column names:
        # 기준코드 = product code, 상세정보 = detail HTML, 모드 = mode,
        # 프로젝트명 = project name.
        img = []
        for i in range(len(self.df)):
            _req = {}
            _req['prdCode'] = str(self.df.iloc[i]['기준코드'])
            _req['content'] = str(self.df.iloc[i]['상세정보'])
            _req['action'] = str(self.df.iloc[i]['action'])
            _req['mode'] = str(self.df.iloc[i]['모드'])
            _req['prjname'] = str(self.df.iloc[i]['프로젝트명'])
            if _req['action'] == "check":
                # "check" rows re-read the image list from the live detail page.
                _img = imarketDetail.imarketDetail(_req['prjname'], _req['mode'], _req['prdCode']).getImgArray()
                img.append(_img)
            elif _req['action'] == "new":
                # "new" rows extract images from the HTML carried in the file.
                _img = self.getImgfromHTML(_req['prdCode'], _req['mode'], _req['action'], _req['content'], _req['prjname'])
                img.append(_img)
        # Flatten the per-product lists into a single task list.
        tasks = []
        for productImgs in img:
            for task in productImgs:
                tasks.append(task)
        return tasks
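
    # A flattened task looks like (illustrative values):
    #   {'productCode': 'P001', 'prjname': 'projA', 'mode': '상세',
    #    'action': 'new', 'seq': 0, 'img': 'http://example.com/a.jpg'}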

    def resFile(self):
        # %S (seconds) rather than the platform-dependent %s (epoch time).
        resfile = "imgCrawl_" + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + ".tsv"
        return resfile
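
    # e.g. a run started at 2024-01-02 13:04:05 yields
    # "imgCrawl_20240102-13_04_05.tsv".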

    def dirname(self, prjname, action):
        # One target directory per project/action/run, created on first use.
        directory = self.targetDirectory + "/_" + str(prjname) + '_' + str(action) + '_' + str(self.startTime.strftime("%Y%m%d-%H_%M_%S")) + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        return directory

    def getExtension(self, filename):
        # Everything after the last dot, upper-cased.
        return filename.split(".")[-1].upper()
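
    # e.g. getExtension("photo.large.jpeg") -> "JPEG"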

    def filename(self, img, prdCode, mode, seq, convert):
        # Compose <prdCode>_<seq>.<EXT>; "상세" (detail) mode inserts "_d_"
        # instead of "_", and convert == "Y" adds "_convert" and forces JPG.
        extension = self.getExtension(img)
        fcmode = ""
        fnmode = "_"
        if mode == "상세":
            fnmode = "_d_"
        if convert == "Y":
            fcmode = "_convert"
            extension = "JPG"
        fileName = str(prdCode) + fnmode + str(seq) + fcmode + "." + extension
        return fileName
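
    # e.g. (illustrative arguments):
    #   filename("http://example.com/img.png", "P001", "상세", 0, "N") -> "P001_d_0.PNG"
    #   filename("http://example.com/img.png", "P001", "base", 0, "Y") -> "P001_0_convert.JPG"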

    def writeHeader(self):
        # TSV header columns (Korean): 프로젝트 = project, 모드 = mode,
        # 유형 = type, 기준코드 = product code, 시퀀스 = sequence,
        # 원URL = source URL, 파일명 = file name.
        resHeader = "프로젝트" + "\t" + "모드" + "\t" + "유형" + "\t" + "기준코드" + "\t" + "시퀀스" + "\t" + "원URL" + "\t" + "파일명"
        insertheader = "echo \"" + resHeader + "\" > " + self.resFile()
        executeinsertHeader = subprocess.Popen(insertheader, stdout=subprocess.PIPE, shell=True)
        executeinsertHeader.wait()

    def crawlCmd(self):
        # Build one shell command per image: a wget download followed by an
        # echo appending the task's metadata to the result TSV. Note that the
        # URL and file name are only double-quoted, not shell-escaped.
        commands = []
        ar = self.imglist
        for i in range(len(ar)):
            downloadfile = self.dirname(ar[i]['prjname'], ar[i]['action']) + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            orgURL = ar[i]['img']
            command = "wget -q --tries=2 --timeout=10 -o /dev/null -O \"" + downloadfile + "\" \"" + orgURL + "\";"
            logformat = str(ar[i]['prjname']) + "\t" + str(ar[i]['action']) + "\t" + str(ar[i]['mode']) + "\t" + str(ar[i]['productCode']) + "\t" + str(ar[i]['seq']) + "\t" + str(ar[i]['img']) + "\t" + self.filename(ar[i]['img'], ar[i]['productCode'], ar[i]['mode'], ar[i]['seq'], "N")
            logcommand = "echo \"" + logformat + "\" >> " + self.resFile() + ";"
            command = command + logcommand
            commands.append(command)
        return commands
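
    # A generated command looks roughly like (illustrative paths; the echo
    # payload carries literal tab characters):
    #   wget -q --tries=2 --timeout=10 -o /dev/null \
    #        -O "/home/maddiekorea/imgs/crawl/_projA_new_20240102-13_04_05/P001_0.JPG" \
    #        "http://example.com/a.jpg";echo "projA<TAB>new<TAB>..." >> imgCrawl_20240102-13_04_05.tsv;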

    def execution(self, commandpointer):
        # Run one prepared command and log per-download timing. Use the
        # prebuilt command list for the count instead of recomputing
        # crawlCmd() on every call.
        startTime = datetime.now()
        executeCrawl = subprocess.Popen(self.downloadcommands[commandpointer], stdout=subprocess.PIPE, shell=True)
        executeCrawl.wait()
        endTime = datetime.now() - startTime
        logging.warning("Downloading... : " + str(commandpointer + 1) + "/" + str(len(self.downloadcommands)) + "\t" + str(endTime))

########################################
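
# Example usage (a sketch, not from the source; assumes a tab-separated
# export with the columns read in prdImgArray and that imarketDetail is
# importable next to this script):
#
#   wrapper = wgetWrapper("/path/to/products.tsv")
#   wrapper.writeHeader()
#   for i in range(len(wrapper.downloadcommands)):
#       wrapper.execution(i)
#   wrapper.timeLog()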