python_apps/crwlers/lib/imagemagickWrapper.py
2023-11-03 14:49:12 +09:00

108 lines
5.0 KiB
Python

import logging
import math
import os
import pathlib
import re
import shlex
import subprocess
import sys
import urllib
from datetime import datetime

import bs4
import pandas as pd
import requests
class imagemagickWrapper:
    """Plan and run ImageMagick conversions for images listed in a crawl log.

    The constructor reads a tab-separated crawl log, checks every listed file
    below ``targetDirectory`` and stores one shell command per convertible
    image in ``convertCommands``.  Rows that cannot be converted are reported
    in the ``*_convert.tsv`` log derived from the source log path.
    """

    # Upper-cased file extensions that are handed to ``convert``.
    _supportedExtensions = ("JPG", "JPEG", "PNG")

    def __init__(self, logfile):
        """Load ``logfile`` and prepare the conversion commands.

        Args:
            logfile: Path of the tab-separated crawl log (with a header row).
        """
        self.startTime = datetime.now()
        self.targetDirectory = '/home/maddiekorea/imgs/crawl'
        self.srcfile = logfile
        self.resfile = self.convertLogFile()
        self.df = self.dataFrame()
        self.convertCommands = self.checkImages()

    def dataFrame(self):
        """Return the crawl log parsed as a DataFrame (TSV, first row = header)."""
        return pd.read_csv(self.srcfile, sep='\t', header=0)

    def timeLog(self):
        """Log the time elapsed since this object was created."""
        elapsed = datetime.now() - self.startTime
        # ``logging`` here is the stdlib module: the method of the same name
        # lives in class scope, which method bodies do not see.
        logging.warning("Converting Completed.\t" + str(elapsed))

    def convertLogFile(self):
        """Return the result-log path: the source path with its extension
        replaced by ``_convert.tsv``.

        ``os.path.splitext`` only strips the final extension, so dots in
        directory names or earlier in the file name are preserved.
        """
        root, _extension = os.path.splitext(self.srcfile)
        return root + "_convert.tsv"

    def getExtension(self, filename):
        """Return the upper-cased text after the last ``.`` of ``filename``.

        A name without any dot is returned upper-cased as a whole.
        """
        return filename.rsplit(".", 1)[-1].upper()

    def dirname(self, projectname, mode):
        """Return the crawl directory name ``_<project>_<mode>_<tail>``.

        ``tail`` is the source log's base name with ``imgCrawl_`` and ``.tsv``
        removed; the base name is used so that a directory component in
        ``srcfile`` does not leak into the directory name.
        """
        tail = os.path.basename(self.srcfile)
        tail = re.sub(r"imgCrawl_", "", tail)
        tail = re.sub(r"\.tsv", "", tail)
        return "_" + projectname + "_" + mode + "_" + tail

    def logging(self, errormsg):
        """Append ``errormsg`` as one line to the result log.

        The line is written directly from Python rather than through a shell
        ``echo`` so that quotes, ``$`` or backticks in URLs cannot break or
        alter the command.
        """
        with open(self.convertLogFile(), "a", encoding="utf-8") as logfile:
            logfile.write(errormsg + "\n")

    def execution(self, commandpointer):
        """Run the prepared command at index ``commandpointer`` and log timing.

        Args:
            commandpointer: Zero-based index into ``convertCommands``.
        """
        started = datetime.now()
        # stdout is discarded instead of piped: an unread pipe combined with
        # wait() can block once the child fills the pipe buffer.
        subprocess.run(
            self.convertCommands[commandpointer],
            shell=True,
            stdout=subprocess.DEVNULL,
        )
        elapsed = datetime.now() - started
        logging.warning(
            "Converting... : " + str(commandpointer + 1) + "/"
            + str(len(self.convertCommands)) + "\t" + str(elapsed)
        )

    def _imageSize(self, path):
        """Return ``(width, height)`` of the image at ``path`` via ``identify``.

        Raises:
            subprocess.CalledProcessError: ``identify`` exited non-zero.
            OSError: ``identify`` could not be started.
            ValueError: the output did not contain two integers.
        """
        output = subprocess.check_output(
            ["identify", "-format", "%w %h\\n", path]
        )
        # Only the first two tokens are used (one "<w> <h>" line per frame).
        width, height = output.decode().split()[:2]
        return int(width), int(height)

    def checkImages(self):
        """Validate every row of the crawl log and build conversion commands.

        For each row the downloaded file must exist, be non-empty and have a
        supported extension; otherwise a row describing the problem is
        appended to the result log.  Rows whose type column is ``상세``
        (detail) are only resized; all other rows are first padded to a white
        square whose side is the larger of width and height.

        Returns:
            list[str]: One shell command per convertible image.  Each command
            converts into a temporary ``*_c.JPG``, then removes the original
            and renames the temporary file, and appends a success row to the
            result log.  The steps are chained with ``&&`` so a failed
            ``convert`` neither deletes the original nor reports success; a
            ``Converting Error.`` row is appended instead.
        """
        commands = []
        logPath = self.convertLogFile()
        for _, row in self.df.iterrows():
            projectname = str(row['프로젝트'])
            mode = str(row['모드'])
            kind = str(row['유형'])
            prdCode = str(row['기준코드'])
            seq = str(row['시퀀스'])
            orgUrl = str(row['원URL'])
            filename = str(row['파일명'])

            # NOTE(review): the stem is everything before the FIRST dot, so
            # "a.b.jpg" becomes "a.JPG"; kept as-is to preserve output names.
            stem = filename.split(".")[0]
            convertedfilename = stem + ".JPG"
            directory = (
                self.targetDirectory + "/"
                + self.dirname(projectname, mode) + "/"
            )
            downloadedfile = directory + filename
            convertingfile = directory + stem + "_c.JPG"
            convertedfile = directory + convertedfilename
            rowPrefix = "\t".join(
                [projectname, mode, kind, prdCode, seq, orgUrl]
            ) + "\t"

            # Existence must be checked before the size is read; stat() on a
            # missing file raises instead of reaching the error row below.
            source = pathlib.Path(downloadedfile)
            if not source.exists():
                self.logging(
                    rowPrefix + filename + "\t"
                    + "file does not exist, crawler error"
                )
                continue
            if source.stat().st_size == 0:
                self.logging(
                    rowPrefix + filename + "\t" + "404 error from server"
                )
                continue
            if self.getExtension(filename) not in self._supportedExtensions:
                self.logging(
                    rowPrefix + filename + "\t" + "Unsupported file format."
                )
                continue

            if kind == "상세":
                convertStep = (
                    "convert " + shlex.quote(downloadedfile)
                    + " -quality 90 -resize \"918x>\" "
                    + shlex.quote(convertingfile)
                )
            else:
                try:
                    width, height = self._imageSize(downloadedfile)
                except (subprocess.CalledProcessError, OSError, ValueError):
                    self.logging(
                        rowPrefix + filename + "\t" + "Converting Error."
                    )
                    continue
                # Numeric maximum: comparing the raw strings would rank
                # "999" above "1000".
                side = str(max(width, height))
                convertStep = (
                    "convert " + shlex.quote(downloadedfile)
                    + " -background white -gravity center -extent "
                    + side + "x" + side
                    + " -quality 90 -resize \"1000x>\" "
                    + shlex.quote(convertingfile)
                )

            successRow = (
                rowPrefix + convertedfilename + "\t"
                + "successfully Converted."
            )
            failureRow = rowPrefix + filename + "\t" + "Converting Error."
            # printf '%s\n' prints the row verbatim; echo may interpret
            # backslashes depending on the shell.
            commands.append(
                convertStep
                + " && rm -f " + shlex.quote(downloadedfile)
                + " && mv -f " + shlex.quote(convertingfile)
                + " " + shlex.quote(convertedfile)
                + " && printf '%s\\n' " + shlex.quote(successRow)
                + " >> " + shlex.quote(logPath)
                + " || printf '%s\\n' " + shlex.quote(failureRow)
                + " >> " + shlex.quote(logPath)
            )
        return commands