九游渠道游戏排行榜爬虫
timey小先生 2021-11-03 00:11:22阅读 118 技术、静态爬虫、正则表达式

静态爬虫:封装请求头(header),用 BeautifulSoup 定位表格节点,再用正则表达式提取各字段的值。

九游渠道热门游戏榜单爬取

import requests
import re
from bs4 import BeautifulSoup


class JiuGamesSpider(object):
    """Static scraper for a 9game (九游) game ranking page.

    The page is downloaded with a browser-like header set, the ranking table
    is located with BeautifulSoup, and each field is then extracted from the
    cell's HTML text with a regular expression.
    """

    def __init__(self, url, timeout=10):
        """Store the target page and the request settings.

        Args:
            url: Address of the ranking page to scrape.
            timeout: Seconds to wait for the server before the request is
                abandoned. Without a timeout, requests can block forever.
        """
        self.web = "http://www.9game.cn"  # prepended to every scraped href
        self.url = url
        self.timeout = timeout
        self.header = {
            "Content-Type": "text/html; charset=utf-8",
            "Connection": "keep-Alive",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/77.0.3865.120 Safari/537.36',
        }

    def run_spider(self):
        """Fetch the 9game new-games hot ranking and return its rows.

        The result is also printed to stdout before it is returned.

        Returns:
            A list of dicts, one per ``<tr>`` of the ranking table except the
            first (presumably the header row). Each dict has the keys
            "GameRank", "GameName", "GameType", "link", "GameStatus" and
            "GameHot".

        Raises:
            requests.RequestException: On a network failure, a timeout, or a
                non-success HTTP status.
            IndexError: If the page does not contain the expected table or a
                row is missing one of the expected cells.
        """
        response = requests.get(self.url, headers=self.header, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.select('div[class="box-text"]')[0]  # container of the ranking table
        games_table = content.find_all(['tr'])

        # Extraction patterns, applied to the HTML text of a single cell.
        rank_re = r'n">(.*?)</'
        type_re = r'type">(.*?)</'
        title_re = r'title="(.*?)"'
        href_re = r'href="(.*?)"'
        status_re = r'p">(.*?)</'
        time_div_re = r'<div class="[a-z]">(.*?)</'

        games_result = []
        for game in games_table[1:]:
            row = dict()
            row["GameRank"] = re.findall(rank_re, str(game.select('span[class="n"]')[0]))[0]
            # The name cell carries both the title attribute and the link.
            name_cell_html = str(game.select('td[class="name"]')[0])
            row["GameName"] = re.findall(title_re, name_cell_html)[0]
            row["GameType"] = re.findall(type_re, str(game.select('td[class="type"]')[0]))[0]
            # The link can be followed to crawl the game's own page.
            row["link"] = self.web + re.findall(href_re, name_cell_html)[0]
            try:
                row["GameStatus"] = re.findall(status_re, str(game.select('td[class="static"]')[0]))[0]
            except IndexError:
                # No usable "static" cell: fall back to the second <div> of
                # the "time time1" cell.
                time_parts = re.findall(time_div_re, str(game.select('td[class="time time1"]')[0]))
                row["GameStatus"] = time_parts[1]
            row["GameHot"] = 0  # the original author notes there is no hot value; fixed at 0
            games_result.append(row)
        print(games_result)
        return games_result


if __name__ == '__main__':
    # Script entry point: scrape the 9game new-games hot list once, then
    # print a completion message.
    url = "http://www.9game.cn/xyrb/"
    spider = JiuGamesSpider(url)
    spider.run_spider()
    print("此次9Games spider end!")

 

运行结果: