静态爬虫示例:封装请求头(header),并使用正则表达式提取字段值。
爬取九游渠道的热门游戏榜单。
import requests
import re
from bs4 import BeautifulSoup
class JiuGamesSpider(object):
    """Scrape the game ranking table from a 9game.cn ranking page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Every ``<tr>`` after the first one inside ``div[class="box-text"]`` is
    turned into a dict with the keys ``GameRank``, ``GameName``,
    ``GameType``, ``link``, ``GameStatus`` and ``GameHot``.

    Args:
        url: Address of the ranking page to scrape.
        timeout: Value passed as ``timeout`` to ``requests.get`` so that a
            stalled server cannot block the caller forever.
    """

    # The two status patterns are applied to the serialized cell markup.
    # They are kept exactly as originally written because they encode
    # details of the page markup that are not visible from this code.
    _STATUS_RE = re.compile(r'p">(.*?)</')
    _TIME_DIV_RE = re.compile(r'<div class="[a-z]">(.*?)</')

    def __init__(self, url, timeout=10):
        self.web = "http://www.9game.cn"
        self.url = url
        self.timeout = timeout
        self.header = {
            "Content-Type": "text/html; charset=utf-8",
            "Connection": "keep-Alive",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/77.0.3865.120 Safari/537.36',
        }

    def run_spider(self):
        """Fetch the ranking page and return one dict per table row.

        The result list is also printed before it is returned.

        Returns:
            list[dict]: One entry per ``<tr>`` after the header row.

        Raises:
            requests.RequestException: If the request fails, times out or
                the server answers with an HTTP error status.
            IndexError: If the ranking container or an expected cell is
                missing from the page.
        """
        response = requests.get(self.url, headers=self.header, timeout=self.timeout)
        # Fail on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        containers = soup.select('div[class="box-text"]')
        if not containers:
            raise IndexError(
                'ranking container div[class="box-text"] not found at %s' % self.url
            )

        rows = containers[0].find_all('tr')
        # The first row is skipped; it is treated as the table header.
        games_result = [self._parse_row(row) for row in rows[1:]]
        print(games_result)
        return games_result

    def _parse_row(self, row):
        """Build the result dict for a single ``<tr>`` of the ranking table.

        Text and attribute values are read from the parsed tree rather than
        from ``str(tag)``, so they are not entity-escaped again (``&`` stays
        ``&`` in names and links) and nested tags or line breaks inside a
        cell do not break extraction. Text values are whitespace-stripped.
        Missing cells raise IndexError.
        """
        name_cell = row.select('td[class="name"]')[0]
        return {
            "GameRank": row.select('span[class="n"]')[0].get_text(strip=True),
            # First element inside the name cell that carries the attribute.
            "GameName": name_cell.select('[title]')[0]['title'],
            "GameType": row.select('td[class="type"]')[0].get_text(strip=True),
            # The href is appended to the site root; the resulting link can
            # be used to crawl the game's own page.
            "link": self.web + name_cell.select('[href]')[0]['href'],
            "GameStatus": self._extract_status(row),
            # The hot value is not scraped; 0 is a placeholder.
            "GameHot": 0,
        }

    def _extract_status(self, row):
        """Return the status text of a row.

        The ``td[class="static"]`` cell is tried first. When it is absent
        or does not match, the second single-letter-class ``<div>`` of the
        ``td[class="time time1"]`` cell is used instead.

        Raises:
            IndexError: If neither cell yields a status.
        """
        static_cells = row.select('td[class="static"]')
        if static_cells:
            matches = self._STATUS_RE.findall(str(static_cells[0]))
            if matches:
                return matches[0]

        time_cells = row.select('td[class="time time1"]')
        if not time_cells:
            raise IndexError(
                'row has neither a usable td[class="static"] nor a '
                'td[class="time time1"] cell'
            )
        parts = self._TIME_DIV_RE.findall(str(time_cells[0]))
        if len(parts) < 2:
            raise IndexError(
                'td[class="time time1"] cell holds %d matching <div> element(s), '
                'expected at least 2' % len(parts)
            )
        return parts[1]
if __name__ == '__main__':
    # Script entry point: scrape the ranking page once, then print a
    # completion message.
    url = "http://www.9game.cn/xyrb/"
    spider = JiuGamesSpider(url)
    spider.run_spider()
    print("此次9Games spider end!")
运行结果: