#!/Users/maddiekorea/Workspace/bin/python
# version 202104
# Excel image crawler.
#
# Reads product rows from an .xlsx file, extracts <img src> URLs from each
# product's HTML description column, and (optionally) downloads the images
# with wget into a dated, per-supplier directory.
#
# Usage: script.py <xlsx_file> <crawl_flag>
#   xlsx_file  - spreadsheet with columns '공급사명' (supplier name),
#                '기준코드' (product base code), '상세설명' (HTML description)
#   crawl_flag - "N" prints the image list only; any other value downloads.
import pandas as pd
import requests, bs4, urllib, sys, re, math, logging, os
from datetime import datetime
import subprocess


def main():
    xlsx_file = str(sys.argv[1])
    crawl_or_not = str(sys.argv[2])

    df = pd.read_excel(xlsx_file)
    # Supplier name is taken from the first row and applied to every row.
    src_name = df.iloc[0]['공급사명']

    for row_idx in range(len(df)):
        product_code = df.iloc[row_idx]['기준코드']
        content = df.iloc[row_idx]['상세설명']

        # Empty/NaN description cells come back as float NaN -> treat as "".
        if not isinstance(content, str):
            content = ""

        # Repair a known HTML defect in the source data: a bare `alt=`
        # attribute fused with the following `src` attribute.
        content = content.replace("alt= src", "alt=\"\" src")

        soup = bs4.BeautifulSoup(content, 'lxml')
        img_urls = [tag.get('src') for tag in soup.select('img')]

        # Destination directory is invariant per row: hoist it out of the
        # per-image loop. As in the original, it is only created when the
        # row actually has at least one <img> entry.
        directory = ('/home/maddiekorea/imgs/crawl/_' + src_name + '_'
                     + datetime.now().strftime("%Y%m%d"))
        if img_urls:
            os.makedirs(directory, exist_ok=True)

        for img_idx, url in enumerate(img_urls):
            # Extension = text after the last '.' of the URL, upper-cased
            # (same semantics as the original split-and-take-last).
            extension = url.rsplit(".", 1)[-1].upper()
            file_name = str(product_code) + "_d_" + str(img_idx) + "." + extension

            # Placeholder value meaning "no image" -> skip entirely.
            if url == "noneContent.img":
                continue

            if crawl_or_not != "N":
                # SECURITY FIX: the URL comes from spreadsheet content, so it
                # must never be interpolated into a shell command string
                # (the original used shell=True -> command injection).
                # List-argument invocation runs wget directly, no shell.
                subprocess.run(
                    ["wget", "-q", "--tries=2", "--timeout=10",
                     "-o", "/dev/null",
                     "-O", directory + "/" + file_name,
                     url],
                    stdout=subprocess.PIPE,
                )

            print(src_name + "\t" + str(product_code) + "\t" + str(url)
                  + "\t" + file_name)


if __name__ == "__main__":
    main()