import logging
import math
import os
import pathlib
import re
import shlex
import subprocess
import sys
import urllib
from datetime import datetime

import bs4
import pandas as pd
import requests


class imagemagickWrapper:
    """Build and run ImageMagick commands for images listed in a crawl log.

    The constructor reads a tab-separated crawl log, checks every listed
    image on disk and prepares one shell command per convertible image in
    ``self.convertCommands``.  Images that cannot be converted are reported
    immediately in the result log (see ``convertLogFile``).  Call
    ``execution(i)`` to run the i-th prepared command.
    """

    # Upper-cased file extensions that are handed to ImageMagick.
    SUPPORTED_EXTENSIONS = ("JPG", "JPEG", "PNG")

    def __init__(self, logfile):
        """Load ``logfile`` (TSV crawl log) and prepare the conversion commands."""
        self.startTime = datetime.now()
        self.targetDirectory = '/home/maddiekorea/imgs/crawl'
        self.srcfile = logfile
        self.resfile = self.convertLogFile()
        self.df = self.dataFrame()
        self.convertCommands = self.checkImages()

    def dataFrame(self):
        """Return the crawl log as a DataFrame (tab-separated, header row)."""
        return pd.read_csv(self.srcfile, sep='\t', header=0)

    def timeLog(self):
        """Log the time elapsed since this object was created."""
        consumtime = datetime.now() - self.startTime
        logging.warning("Converting Completed.\t" + str(consumtime))

    def convertLogFile(self):
        """Return the result-log path: the source path with its extension
        replaced by ``_convert.tsv``.

        ``os.path.splitext`` only strips the final extension, so dots in
        directory names or in the file stem are preserved.
        """
        stem, _ext = os.path.splitext(self.srcfile)
        return stem + "_convert.tsv"

    def getExtension(self, filename):
        """Return the upper-cased text after the last dot of ``filename``.

        When ``filename`` has no dot the whole name is returned upper-cased.
        """
        return filename.rsplit(".", 1)[-1].upper()

    def dirname(self, projectname, mode):
        """Return the image sub-directory name for a project/mode pair.

        The name is ``_<projectname>_<mode>_<tail>`` where ``tail`` is the
        source log name with ``imgCrawl_`` and ``.tsv`` removed.
        """
        tail = re.sub(r"imgCrawl_", "", self.srcfile)
        tail = re.sub(r"\.tsv", "", tail)
        return "_" + projectname + "_" + mode + "_" + tail

    def logging(self, errormsg):
        """Append ``errormsg`` as one line to the result log.

        The line is written directly instead of through ``echo`` in a shell,
        so quotes, ``$`` or backticks in the message are stored verbatim.
        """
        with open(self.convertLogFile(), "a", encoding="utf-8") as logFile:
            logFile.write(errormsg + "\n")

    def execution(self, commandpointer):
        """Run the prepared command at index ``commandpointer`` and log its duration."""
        startTime = datetime.now()
        # The command's stdout is never read, so it is discarded; piping it
        # and then waiting could block once the pipe buffer fills up.
        subprocess.run(
            self.convertCommands[commandpointer],
            shell=True,
            stdout=subprocess.DEVNULL,
        )
        elapsed = datetime.now() - startTime
        logging.warning(
            "Converting... : " + str(commandpointer + 1) + "/"
            + str(len(self.convertCommands)) + "\t" + str(elapsed)
        )

    def checkImages(self):
        """Validate every image in the crawl log and build conversion commands.

        Returns:
            list[str]: one shell command per image that exists, is non-empty
            and has a supported extension.  Every other image gets an error
            line in the result log and no command.
        """
        commands = []
        for _, row in self.df.iterrows():
            projectname = str(row['프로젝트'])  # project
            mode = str(row['모드'])             # mode
            kind = str(row['유형'])             # image kind
            prdCode = str(row['기준코드'])      # reference code
            seq = str(row['시퀀스'])            # sequence
            orgUrl = str(row['원URL'])          # original URL
            filename = str(row['파일명'])       # file name

            record = [projectname, mode, kind, prdCode, seq, orgUrl]
            stem = filename.split(".")[0]
            convertedfilename = stem + ".JPG"
            directory = self.targetDirectory + "/" + self.dirname(projectname, mode)
            downloadedfile = directory + "/" + filename
            convertingfile = directory + "/" + stem + "_c.JPG"
            convertedfile = directory + "/" + convertedfilename

            # Check existence BEFORE asking for the size: stat() on a missing
            # file raises instead of reaching the "does not exist" report.
            source = pathlib.Path(downloadedfile)
            if not source.exists():
                self.logging(self._logLine(record, filename, "file does not exist, crawler error"))
                continue
            if source.stat().st_size == 0:
                self.logging(self._logLine(record, filename, "404 error from server"))
                continue
            if self.getExtension(filename) not in self.SUPPORTED_EXTENSIONS:
                self.logging(self._logLine(record, filename, "Unsupported file format."))
                continue

            if kind == "상세":
                # This kind is only shrunk to at most 918px wide.
                options = ["-quality", "90", "-resize", "918x>"]
            else:
                try:
                    width, height = self._imageSize(downloadedfile)
                except (subprocess.CalledProcessError, OSError, ValueError, IndexError):
                    self.logging(self._logLine(record, filename, "Converting Error."))
                    continue
                # Pad to a white square whose side is the larger dimension
                # (compared numerically), then shrink to at most 1000px wide.
                side = str(max(width, height))
                options = [
                    "-background", "white", "-gravity", "center",
                    "-extent", side + "x" + side,
                    "-quality", "90", "-resize", "1000x>",
                ]

            commands.append(self._conversionCommand(
                downloadedfile, convertingfile, convertedfile, options,
                self._logLine(record, convertedfilename, "successfully Converted."),
                self._logLine(record, filename, "Converting Error."),
            ))
        return commands

    def _logLine(self, record, name, status):
        """Return one tab-separated result-log line for ``record``."""
        return "\t".join(record + [name, status])

    def _imageSize(self, path):
        """Return ``(width, height)`` in pixels as reported by ``identify``.

        Only the first output line is used, so a multi-frame file reports
        the size of its first frame.
        """
        output = subprocess.check_output(["identify", "-format", "%w %h\n", path])
        width, height = output.decode().splitlines()[0].split()
        return int(width), int(height)

    def _conversionCommand(self, source, temporary, target, options, successLine, errorLine):
        """Return the shell command converting ``source`` into ``target``.

        The steps are chained with ``&&`` so the original is only removed,
        and success only logged, when ``convert`` worked; any failure logs
        ``errorLine`` instead.  Every argument is shell-quoted.
        """
        def quoted(arguments):
            return " ".join(shlex.quote(argument) for argument in arguments)

        logTarget = " >> " + shlex.quote(self.convertLogFile())
        steps = [
            quoted(["convert", source] + options + [temporary]),
            quoted(["rm", "-rf", source]),
            quoted(["mv", temporary, target]),
            "printf '%s\\n' " + shlex.quote(successLine) + logTarget,
        ]
        fallback = "printf '%s\\n' " + shlex.quote(errorLine) + logTarget
        return " && ".join(steps) + " || " + fallback