#!/usr/bin/env python
# coding=utf-8
import os
import time
import threading
from multiprocessing import Pool, cpu_count

import requests
from bs4 import BeautifulSoup

headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/56.0.2924.87 Safari/537.36',
    'Referer': "http://www.mmjpg.com"
}
dir_path = r"E:\mmjpg"  # local path where downloaded images are stored

def save_pic(pic_src, pic_cnt):
    """Download one image into the current working directory."""
    try:
        img = requests.get(pic_src, headers=headers, timeout=10)
        imgname = "pic_cnt_{}.jpg".format(pic_cnt + 1)
        with open(imgname, 'ab') as f:
            f.write(img.content)
        print(imgname)
    except Exception as e:
        print(e)

def make_dir(folder_name):
    """Create the album folder and switch into it."""
    path = os.path.join(dir_path, folder_name)
    # If the directory already exists, the album was crawled before, so skip it
    # (de-duplication for efficiency). Returns False if it exists, True otherwise.
    if not os.path.exists(path):
        os.makedirs(path)
        print(path)
        os.chdir(path)
        return True
    print("Folder already exists!")
    return False

def delete_empty_dir(dir):
    """If the program was interrupted midway, a folder may exist with no images
    downloaded into it yet; since the folder exists, that album would then be
    skipped, so remove empty folders before crawling."""
    if os.path.exists(dir):
        if os.path.isdir(dir):
            for d in os.listdir(dir):
                path = os.path.join(dir, d)  # build the path of the child entry
                if os.path.isdir(path):
                    delete_empty_dir(path)  # recursively delete empty folders
        if not os.listdir(dir):
            os.rmdir(dir)
            print("remove the empty dir: {}".format(dir))
    else:
        print("Please start your performance!")  # the root folder does not exist yet

lock = threading.Lock()  # global resource lock (per-process only; see the note after the script)

def urls_crawler(url):
    """Crawler entry point: the main crawling logic for one album."""
    try:
        r = requests.get(url, headers=headers, timeout=10).text
        # album title, also used as the folder name; re-decode to fix mojibake
        folder_name = BeautifulSoup(r, 'lxml').find('h2').text.encode('ISO-8859-1').decode('utf-8')
        with lock:
            if make_dir(folder_name):
                # number of images in the album
                max_count = BeautifulSoup(r, 'lxml').find('div', class_='page').find_all('a')[-2].get_text()
                # per-image page URLs
                page_urls = [url + "/" + str(i) for i in range(1, int(max_count) + 1)]
                # image URLs
                img_urls = []
                for index, page_url in enumerate(page_urls):
                    result = requests.get(page_url, headers=headers, timeout=10).text
                    # the last image has no wrapping <a> tag, just a bare <img>,
                    # so it is parsed separately
                    if index + 1 < len(page_urls):
                        img_url = BeautifulSoup(result, 'lxml').find('div', class_='content').find('a').img['src']
                        img_urls.append(img_url)
                    else:
                        img_url = BeautifulSoup(result, 'lxml').find('div', class_='content').find('img')['src']
                        img_urls.append(img_url)

                for cnt, img_url in enumerate(img_urls):
                    save_pic(img_url, cnt)
    except Exception as e:
        print(e)

if __name__ == "__main__":
    urls = ['http://mmjpg.com/mm/{cnt}'.format(cnt=cnt) for cnt in range(1, 953)]
    pool = Pool(processes=cpu_count())
    try:
        delete_empty_dir(dir_path)
        pool.map(urls_crawler, urls)
    except Exception as e:
        time.sleep(30)  # back off, then clean up and retry once
        delete_empty_dir(dir_path)
        pool.map(urls_crawler, urls)
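
A caveat on the lock above, with a hedged sketch: threading.Lock() is per-process state, so each Pool worker ends up with its own independent copy and make_dir()/os.chdir() are not actually serialized across workers. One process-safe variant (the _init_worker helper below is illustrative, not part of the original script) passes a multiprocessing.Lock to every worker through the Pool initializer:

from multiprocessing import Lock

def _init_worker(shared_lock):
    # rebind the module-level lock that urls_crawler acquires
    global lock
    lock = shared_lock

if __name__ == "__main__":
    pool = Pool(processes=cpu_count(),
                initializer=_init_worker, initargs=(Lock(),))
    pool.map(urls_crawler, urls)  # urls as defined above
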
import urllib.request
import re

# 1. fetch the index page source
# 2. extract the chapter hyperlinks
# 3. fetch the source of each chapter page
# 4. extract the novel text
# 5. download: file operations


# camelCase naming
# fetch the novel content
def getNovertContent():
    # fetch and decode the index page
    html = urllib.request.urlopen("http://www.quanshuwang.com/book/0/269").read()
    html = html.decode("gbk")
    # without the parentheses (capture groups), nothing is captured
    # in the regex, .*? is a lazy match-anything
    # NOTE: the original pattern was garbled in this listing; this one is
    # reconstructed from the expected (url, title) result shown below
    reg = r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>'
    # compiling the pattern up front improves efficiency
    reg = re.compile(reg)
    urls = re.findall(reg, html)
    # print(urls)
    # a list of (url, title) tuples, e.g.
    # [(http://www.quanshuwang.com/book/0/269/78850.html, 第一章 山邊小村),
    #  (http://www.quanshuwang.com/book/0/269/78854.html, 第二章 青牛鎮)]
    for url in urls:
        # chapter URL
        novel_url = url[0]
        # chapter title
        novel_title = url[1]

        chapt = urllib.request.urlopen(novel_url).read()
        chapt_html = chapt.decode("gbk")
        # r denotes a raw string, so backslashes need no escaping: r"\d" instead of "\\d"
        # NOTE: the original pattern was garbled in this listing; this one assumes
        # the chapter text sits in a single container div on the page
        reg = r'<div id="content">(.*?)</div>'
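        # --- a hedged sketch of the remaining steps 4-5 (the original listing
        # --- breaks off here); the regex flag, entity clean-up, and output file
        # --- name below are assumptions, not the original author's code
        content_list = re.findall(reg, chapt_html, re.S)  # re.S lets .*? span newlines
        if content_list:
            content = content_list[0].replace("&nbsp;", " ").replace("<br />", "\n")
            # step 5: file operations - append each chapter to one text file
            with open("novel.txt", "a", encoding="utf-8") as f:
                f.write(novel_title + "\n" + content + "\n")
            print(novel_title)


getNovertContent()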