python_apps/crwlers/xlsximgCrawler.py
2023-11-03 14:49:12 +09:00

54 lines
1.9 KiB
Python

#!/Users/maddiekorea/Workspace/bin/python
#version 202104
#excel img crawler
import pandas as pd
import requests, bs4, urllib, sys, re, math, logging, os
from datetime import datetime
import subprocess
xlsxFile = str(sys.argv[1])
crawlorNot = str(sys.argv[2])
df = pd.read_excel(xlsxFile)
srcName = df.iloc[0]['공급사명']
for i in range(len(df)) :
productCode = df.iloc[i]['기준코드']
content = df.iloc[i]['상세설명']
#냉무보정
if type(content) != str :
content = "<img src=\"noneContent.img\" />"
#html 오류 수정 (alt unescape)
content = content.replace("alt= src","alt=\"\" src")
bs = bs4.BeautifulSoup(content, 'lxml')
img = []
imgs = bs.select('img')
for i in range(len(imgs)) :
dimg = imgs[i].get('src')
img.insert(i,dimg)
for i in range(len(img)) :
ext = img[i].split(".")
index = len(ext) - 1
extension = ext[index].upper()
directory = '/home/maddiekorea/imgs/crawl/_' + srcName + '_' + datetime.now().strftime("%Y%m%d")
if not os.path.exists(directory) : os.makedirs(directory)
fileName = str(productCode) + "_d_" + str(i) + "." + extension
if img[i] != "noneContent.img" :
if crawlorNot != "N" :
shellcmd = "wget -q --tries=2 --timeout=10 -o /dev/null -O " + directory + "/" + fileName + " \"" + img[i] + "\""
executeCrawl = subprocess.Popen(shellcmd,stdout=subprocess.PIPE,shell=True)
executeCrawl.wait()
#convertCmd = "convert " + directory + "/" + fileName + " -quality 90 -resize 920x\> " + directory + "/" + str(productCode) + "_dc_" + str(i) + ".JPG"
#executeConvert = subprocess.Popen(convertCmd,stdout=subprocess.PIPE,shell=True)
print(srcName + "\t" + str(productCode) + "\t" + str(img[i]) + "\t" + fileName)