import requests
from flask import json
from requests.exceptions import RequestException
import re
from multiprocessing import Pool
'''
Scrape the Maoyan movie board using Requests + regular expressions
'''
'''
Fetch the content of a single page
'''
def getOneContent(url, headers, timeout=10):
    """Fetch a page and return its body text.

    :param url: address to request.
    :param headers: HTTP headers sent with the request.
    :param timeout: seconds to wait on the server before giving up.
        Without a timeout a stalled connection would block the calling
        worker indefinitely.
    :return: the response text when the status code is 200; ``None`` for
        any other status or when the request raises ``RequestException``
        (timeouts included).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
'''
Parse the content using a regular expression
'''
# Compiled once at import time. Adjacent string literals are joined by the
# parser, so no "+" is needed between the pieces. Raw strings keep "\d" from
# being treated as an (invalid) string escape. re.S lets "." span newlines,
# which the pattern relies on to cross line breaks inside one <dd> element.
_MOVIE_PATTERN = re.compile(
    r'<dd.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?</a>'
    r'.*?<a.*?data-val.*?>(.*?)</a>.*?star.*?>(.*?)</p>'
    r'.*?releasetime.*?>(.*?)</p>'
    r'.*?integer.*?>(.*?)</i>.*?fraction.*?>(\d+)</i>.*?</dd>',
    re.S,
)


def parserContent(content):
    """Extract movie entries from a board page.

    :param content: HTML text of the page, or ``None``/empty when the
        fetch failed.
    :return: list of 7-tuples of strings, one per matched ``<dd>`` entry:
        (board index, image URL, title, star text, release-time text,
        integer part of the score, fraction part of the score). An empty
        list is returned when there is no content or nothing matches, so
        callers can always iterate the result.
    """
    if not content:
        return []
    return _MOVIE_PATTERN.findall(content)
def processData(results):
    """Turn raw match tuples into dictionaries, one at a time.

    :param results: iterable of indexable rows with at least seven string
        fields, in the order produced by the page-parsing regex.
    :return: generator yielding one dict per row with the keys ``index``,
        ``imgurl``, ``name``, ``star``, ``releasetime`` and ``score``.
    """
    for row in results:
        # The star and release-time fields are trimmed, then their first
        # 3 and 5 characters are dropped (presumably a fixed label prefix
        # on the page -- confirm against the live markup).
        actors = row[3].strip()
        released = row[4].strip()
        record = dict(
            index=row[0],
            imgurl=row[1],
            name=row[2],
            star=actors[3:],
            releasetime=released[5:],
            # Integer and fraction parts are concatenated as strings.
            score=row[5] + row[6],
        )
        yield record
def storeData(data, filename="mmovie.txt"):
    """Append one record to a text file as a single JSON line.

    ``ensure_ascii=False`` keeps non-ASCII characters readable in the file
    instead of writing them as Unicode escape sequences.

    NOTE(review): ``json`` is whatever the module imported under that name
    (the file currently takes it from flask) -- confirm that is intended.

    :param data: JSON-serialisable object to write.
    :param filename: path of the output file; it is opened in append mode
        with UTF-8 encoding. Defaults to ``mmovie.txt``.
    :return: None.
    """
    line = json.dumps(data, ensure_ascii=False) + '\n'
    # The with-statement closes the file on exit; no explicit close() needed.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(line)
def main(offset):
    """Scrape one board page and store every movie found on it.

    :param offset: value placed in the ``offset`` query parameter of the
        board URL.
    :return: None. Nothing is written when the page cannot be fetched or
        yields no matches.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'}
    html = getOneContent(url, headers=headers)
    if not html:
        # Fetch failed or returned an empty body: skip this page instead of
        # passing None down the pipeline.
        return
    results = parserContent(html)
    if not results:
        return
    for item in processData(results):
        storeData(item)
if __name__ == '__main__':
    # Fan the ten page offsets (0, 10, ..., 90) out over a process pool.
    # map() blocks until every page is done, so leaving the with-block
    # (which tears the pool down) cannot cut work short.
    with Pool() as pool:
        pool.map(main, range(0, 100, 10))
# First published by: Itcast (传智播客) AI + Python training academy
# Author: http://python.itcast.cn/