# encoding: utf-8
#
# pip install requests
import requests
# pip install BeautifulSoup4
import bs4
import os


class Spider(object):
    '''
    Crawler/downloader class.

    Instance variables:
        proxies: proxy settings passed to requests
        # e.g. when routing through a local proxy: proxies = {'http': 'http://127.0.0.1:1080'}
    '''

    def __init__(self, proxies=None):
        self.proxies = proxies
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}

    def __get_all_cataurl(self, catalogue_url):
        '''
        Collect the URLs of all catalogue pages reachable from the
        initial catalogue page via the pagination bar.
        '''
        urls = []
        respond = requests.get(catalogue_url, proxies=self.proxies, headers=self.headers)
        soup = bs4.BeautifulSoup(respond.text, 'html.parser')
        # The pagination bar carries the class 'ptt'; find_all() returns an
        # empty list (never None) when it is absent, which means the gallery
        # has only a single catalogue page.
        pagers = soup.find_all(class_='ptt')
        if not pagers:
            urls.append(catalogue_url)
        for pager in pagers:
            for link in pager.find_all('a'):
                if link.string and link.string.isdigit():
                    urls.append(link['href'])
        # The pager usually appears at both the top and the bottom of the
        # page, so drop duplicates while preserving order.
        return list(dict.fromkeys(urls))

    def __get_all_pic(self, cataurl):
        '''
        Download every picture listed on one catalogue page.
        '''
        print('catalogue: ' + cataurl)
        respond = requests.get(cataurl, proxies=self.proxies, headers=self.headers)
        soup = bs4.BeautifulSoup(respond.text, 'html.parser')
        # Create the target directory, named after the page title
        # ('|' is stripped because it is not a valid path character on Windows).
        dirname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               soup.title.text.replace('|', ''))
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        # Record the gallery URL in link.txt.
        with open(os.path.join(dirname, 'link.txt'), 'wt') as f:
            f.write(self.__cata_url)
        # Save every picture: each thumbnail block (class 'gdtm') links to a
        # picture page, which in turn contains the full-size image.
        for thumb in soup.find_all(class_='gdtm'):
            picpage_url = thumb.a['href']
            print('picpage-url: ' + picpage_url)
            # The trailing number of the picture-page URL serves as the file name.
            num = picpage_url[picpage_url.rfind('-') + 1:]
            pic_path = os.path.join(dirname, num + '.jpg')
            if not os.path.isfile(pic_path):
                # Fetch the picture page; the full-size image is the only
                # <img> tag that carries a style attribute.
                r = requests.get(picpage_url, proxies=self.proxies, headers=self.headers)
                s = bs4.BeautifulSoup(r.text, 'html.parser')
                for img in s.find_all('img'):
                    if img.get('style') is not None:
                        print('=> ' + num + '.jpg')
                        r = requests.get(img.get('src'), proxies=self.proxies, headers=self.headers)
                        with open(pic_path, 'wb') as f:
                            f.write(r.content)
                        print('download success!')
            else:
                print('the file already exists')

    def get_pictures(self, catalogue_url):
        '''
        Given the URL of the first catalogue page (catalogue_url),
        download every picture in the gallery.
        '''
        self.__cata_url = catalogue_url
        for cataurl in self.__get_all_cataurl(catalogue_url):
            self.__get_all_pic(cataurl)


if __name__ == '__main__':
    proxies = {'http': 'http://127.0.0.1:1080'}
    # CATAURL = "http://g.e-hentai.org/g/994160/xxxxxx/"
    CATAURL = input("Enter the catalogue URL: ")
    s = Spider(proxies)
    s.get_pictures(CATAURL)
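
# --- Optional hardening sketch (not part of the original script) ---
# The bare requests.get() calls above fail permanently on the first network
# hiccup and fire requests back to back. Below is a minimal sketch of a
# retrying, rate-limited fetch, assuming the urllib3 Retry helper that ships
# with requests. make_session() and polite_get() are hypothetical names; the
# session could be stored on the Spider instance and used in place of the
# bare requests.get() calls.
import time
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter


def make_session(proxies=None, total_retries=3, backoff_factor=1.0):
    '''Build a requests.Session that retries transient failures.'''
    session = requests.Session()
    retry = Retry(total=total_retries,
                  backoff_factor=backoff_factor,  # exponential backoff between retries
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    if proxies:
        session.proxies.update(proxies)
    return session


def polite_get(session, url, delay=1.0, timeout=10):
    '''Fetch a URL after a short pause so the site is not hammered
    (the delay and timeout values are assumptions; tune as needed).'''
    time.sleep(delay)
    return session.get(url, timeout=timeout)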