#!/Users/maddiekorea/Workspace/bin/python
# version 202104
#
# Excel image crawler.
#
# Reads product rows from an .xlsx file and downloads every <img> referenced
# in each product's detail-description HTML column via wget.
#
# Usage: script.py <xlsx-file> <crawl-flag>
#   <crawl-flag>: pass "N" to skip downloading (list/print only);
#                 any other value downloads the images.

import pandas as pd
import requests, bs4, urllib, sys, re, math, logging, os
from datetime import datetime
import subprocess


def _image_srcs(html):
    """Return the src attribute of every <img> tag in *html* (None if absent)."""
    soup = bs4.BeautifulSoup(html, 'lxml')
    return [tag.get('src') for tag in soup.select('img')]


def main():
    xlsx_file = str(sys.argv[1])
    crawl_or_not = str(sys.argv[2])

    df = pd.read_excel(xlsx_file)
    # Supplier name, taken from the first row (column headers are Korean:
    # 공급사명 = supplier name, 기준코드 = base product code, 상세설명 = detail HTML).
    src_name = df.iloc[0]['공급사명']

    # One output directory per supplier per day. The path is loop-invariant,
    # so create it once up front instead of re-checking inside the loops.
    directory = '/home/maddiekorea/imgs/crawl/_' + src_name + '_' + datetime.now().strftime("%Y%m%d")
    os.makedirs(directory, exist_ok=True)

    for row in range(len(df)):
        product_code = df.iloc[row]['기준코드']
        content = df.iloc[row]['상세설명']

        # Empty-description cells come back from pandas as NaN (a float);
        # substitute a sentinel <img> so the pipeline below still runs.
        if not isinstance(content, str):
            content = "<img src=\"noneContent.img\" />"

        # Repair a recurring HTML glitch in the source data: a bare `alt=`
        # that swallows the following src attribute.
        content = content.replace("alt= src", "alt=\"\" src")

        for idx, url in enumerate(_image_srcs(content)):
            if not url:
                # <img> tag without a src attribute — nothing to fetch.
                continue

            # File extension = text after the last dot, uppercased
            # (falls back to the whole URL uppercased if there is no dot,
            # matching the original split-based behavior).
            extension = url.rsplit(".", 1)[-1].upper()
            file_name = str(product_code) + "_d_" + str(idx) + "." + extension

            if url != "noneContent.img":
                if crawl_or_not != "N":
                    # List-form argv with shell=False so URLs scraped from
                    # untrusted HTML cannot inject shell commands. check=False
                    # mirrors the original Popen/wait, which ignored wget's
                    # exit status.
                    subprocess.run(
                        ["wget", "-q", "--tries=2", "--timeout=10",
                         "-o", "/dev/null",
                         "-O", os.path.join(directory, file_name),
                         url],
                        check=False,
                    )
                # NOTE(review): original indentation was lost in extraction;
                # printing for every non-sentinel image (even when the crawl
                # flag is "N") makes the dry-run mode useful — confirm against
                # the original file.
                print(src_name + "\t" + str(product_code) + "\t" + str(url) + "\t" + file_name)


if __name__ == "__main__":
    main()