爬取易车网数据-AI+Python②

Python中有较多的网络爬虫库：

Requests + BeautifulSoup：小型静态网站抓取。

Scrapy：大型、规则明晰的静态网站或 API 爬取。

Selenium：动态页面、需要交互的场景。也可作为 Scrapy 的下载器中间件。

每个库都有各自的适用场合。如果爬取的页面中包含登录验证，或发送请求时进行了签名验证，则无法使用前两种。Selenium是一种模拟浏览器的自动化工具。

易车网数据，包含了签名加密，因此我们采用Selenium爬取。

现通过Trae编写Python代码实现了一个基于Selenium的爬虫，用于爬取易车网（car.yiche.com）的车辆列表和参数配置。

# -*- coding: UTF-8 -*-
"""Selenium crawler for the yiche.com car-selection tool.

Walks the paginated car list at car.yiche.com, follows each car's link to its
configuration ("peizhi") page, and saves the merged records as JSON.
"""
import json
import random
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumCrawler:
    """Crawl the car list and each car's configuration page with Selenium."""

    # Row label on the configuration page -> key stored in the car record.
    _CONFIG_FIELDS = {
        '厂商': 'manufacturer',
        '级别': 'level',
        '能源类型': 'energy_type',
        '上市时间': 'launch_date',
        '车身结构': 'body_type',
        '座位数': 'seats',
    }

    def __init__(self):
        self.driver = None       # Chrome WebDriver, created by setup_driver()
        self.all_cars = []       # car records collected so far
        self.failed_pages = []   # page numbers that yielded no cars

    def setup_driver(self) -> None:
        """Start Chrome with the automation switches turned off."""
        chrome_options = Options()
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        print("启动浏览器...")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.implicitly_wait(10)

    def wait_for_manual_verification(self) -> None:
        """Block until the user confirms the captcha has been solved by hand."""
        print("\n" + "=" * 60)
        print("如果看到验证码，请手动完成验证")
        # input() reads from the terminal, so that is where ENTER must be pressed.
        print("完成后，在终端中按 ENTER 继续")
        print("=" * 60)
        input()

    def extract_car_info(self, car_item):
        """Extract the name, price and detail link from one list item.

        Args:
            car_item: the list-item element (or its HTML) for a single car.

        Returns:
            A dict with 'name', 'price' and 'url' keys, or None if extraction
            raised an exception.
        """
        try:
            # Re-parse the item's own markup so the searches below are scoped
            # to this one car.
            soup = BeautifulSoup(str(car_item), 'html.parser')

            name_tag = soup.find('p', class_='cx-name')
            price_tag = soup.find('p', class_='cx-price')
            link_tag = soup.find('a', href=True)

            return {
                'name': name_tag.get_text(strip=True) if name_tag else '',
                'price': price_tag.get_text(strip=True) if price_tag else '价格未知',
                'url': link_tag['href'] if link_tag else '',
            }
        except Exception as e:
            print(f"    [X] 提取基本信息失败: {e}")
            return None

    @staticmethod
    def _build_config_url(base_url: str) -> str:
        """Turn a car link into the absolute URL of its configuration page."""
        path = base_url.rstrip('/') + '/peizhi/'
        if base_url.startswith('//'):
            # Protocol-relative link: only the scheme is missing.
            return 'https:' + path
        if base_url.startswith('/'):
            # Site-relative link: prepend the host.
            return 'https://car.yiche.com' + path
        return path

    def crawl_config_page(self, base_url):
        """Visit a car's configuration page and read its parameter tables.

        Args:
            base_url: the car's link as found on the list page; may be
                absolute, protocol-relative ('//...') or site-relative ('/...').

        Returns:
            A dict containing whichever of 'manufacturer', 'level',
            'energy_type', 'launch_date', 'body_type' and 'seats' were found.
            Empty if base_url is empty or the page could not be processed.
        """
        if not base_url:
            return {}

        config_url = self._build_config_url(base_url)
        print(f"      访问参数页: {config_url}")

        try:
            self.driver.get(config_url)
            time.sleep(random.uniform(2, 4))

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            config = {}

            for table in soup.find_all('table', class_='param-table'):
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) < 2:
                        continue
                    # First cell is the label, second cell is its value.
                    field = self._CONFIG_FIELDS.get(cells[0].get_text(strip=True))
                    if field is not None:
                        config[field] = cells[1].get_text(strip=True)

            print(f"      配置信息: 厂商={config.get('manufacturer','-')}, 级别={config.get('level','-')}, 能源={config.get('energy_type','-')}")
            return config
        except Exception as e:
            print(f"      [X] 获取配置失败: {e}")
            return {}

    def parse_current_page(self):
        """Parse the list page currently loaded in the browser.

        For every list item with a non-empty name, the car's configuration
        page is crawled as well and merged into the record.

        Returns:
            A list of car dicts; empty if parsing raised an exception.
        """
        try:
            # Snapshot the list page first: crawl_config_page() navigates the
            # same browser away from it.
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            car_items = soup.find_all('div', class_='search-result-list-item')

            cars = []
            for idx, item in enumerate(car_items, 1):
                car_info = self.extract_car_info(item)
                if not car_info or not car_info['name']:
                    continue
                print(f"      [{idx}] {car_info['name']}")
                car_info.update(self.crawl_config_page(car_info['url']))
                cars.append(car_info)
            return cars
        except Exception as e:
            print(f"    解析错误: {e}")
            return []

    def crawl_page(self, page_num):
        """Crawl one page of the car list.

        Args:
            page_num: 1-based page number of the list.

        Returns:
            The list of car dicts found on the page; empty on failure.
        """
        url = f'https://car.yiche.com/xuanchegongju/?f=16&page={page_num}'
        print(f"\n  正在访问: {url}")

        try:
            self.driver.get(url)
            time.sleep(random.uniform(2, 4))

            # A very short page or one mentioning "captcha" is treated as blocked.
            page_source = self.driver.page_source
            if len(page_source) < 5000 or 'captcha' in page_source.lower():
                print("    检测到验证码，请手动验证...")
                self.wait_for_manual_verification()
                time.sleep(2)

            cars = self.parse_current_page()
            if cars:
                print(f"    ✓ 本页找到 {len(cars)} 个车辆")
                return cars
            print("    ✗ 未找到车辆数据")
            return []
        except Exception as e:
            print(f"    ✗ 错误: {e}")
            return []

    def save_progress(self, filename='selenium_progress.json'):
        """Write everything collected so far to a JSON file.

        Args:
            filename: path of the JSON file to (over)write.
        """
        data = {
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'total_cars': len(self.all_cars),
            'failed_pages': self.failed_pages,
            'cars': self.all_cars,
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"\n  [已保存] {filename}")

    def crawl(self, start_page=1, end_page=20):
        """Crawl a range of list pages and save the results.

        Args:
            start_page: first page number to crawl (inclusive).
            end_page: last page number to crawl (inclusive).
        """
        print("=" * 60)
        print("Selenium爬虫 - 爬取车辆列表和参数配置")
        print(f"目标: 第 {start_page} - {end_page} 页")
        print("=" * 60)

        try:
            self.setup_driver()

            # Open the landing page first so a captcha can be solved once,
            # before the paging loop starts.
            print("\n首先访问首页处理验证码...")
            self.driver.get('https://car.yiche.com/xuanchegongju/?f=16')
            time.sleep(3)

            if 'captcha' in self.driver.page_source.lower():
                print("首页需要验证，请手动完成...")
                self.wait_for_manual_verification()

            total_pages = end_page - start_page + 1

            for page_num in range(start_page, end_page + 1):
                current = page_num - start_page + 1
                print(f"\n[{current}/{total_pages}] 爬取第 {page_num} 页...")

                cars = self.crawl_page(page_num)
                if cars:
                    self.all_cars.extend(cars)
                else:
                    self.failed_pages.append(page_num)

                # Checkpoint on every page number divisible by 5.
                if page_num % 5 == 0:
                    self.save_progress()

                # Random pause between pages (skipped after the last one).
                if page_num < end_page:
                    delay = random.uniform(5, 10)
                    print(f"  等待 {delay:.1f} 秒...")
                    time.sleep(delay)

            self.save_progress('final_cars_with_config.json')

            print("\n" + "=" * 60)
            print("爬取完成！")
            print(f"总计: {len(self.all_cars)} 辆车")
            print(f"失败: {len(self.failed_pages)} 页")
            if self.failed_pages:
                print(f"失败页面: {self.failed_pages}")
            print("=" * 60)

            self.print_summary()

        except KeyboardInterrupt:
            print("\n\n用户中断，正在保存当前进度...")
            self.save_progress('interrupted_progress.json')
            print(f"已保存 {len(self.all_cars)} 个车辆")

        finally:
            if self.driver:
                try:
                    input("\n按 ENTER 关闭浏览器...")
                finally:
                    # Quit even if the prompt itself fails (e.g. stdin is
                    # closed), so the browser process is never left behind.
                    self.driver.quit()

    def print_summary(self) -> None:
        """Print a fixed-width table of the first ten collected cars."""
        print("\n参数配置摘要:")
        print("-" * 80)
        print(f"{'名称':<15}{'价格':<12}{'厂商':<12}{'级别':<8}{'能源类型':<10}{'上市时间':<10}")
        print("-" * 80)

        for car in self.all_cars[:10]:
            print(f"{car['name'][:13]:<15}{car['price'][:10]:<12}{car.get('manufacturer','')[:10]:<12}{car.get('level','')[:6]:<8}{car.get('energy_type','')[:8]:<10}{car.get('launch_date','')[:8]:<10}")

        if len(self.all_cars) > 10:
            print(f"\n... 还有 {len(self.all_cars) - 10} 辆车")


if __name__ == '__main__':
    crawler = SeleniumCrawler()
    crawler.crawl(start_page=1, end_page=20)

阅读此AI代码

以“入口 → 主要流程 → 关键方法”流程去读

①主程序入口：从文件末尾的 if __name__ == '__main__' 开始

创建 SeleniumCrawler 类的实例。（不需要进入）
调用 crawl 方法，传入起始页和结束页（1 到 20 页），开始爬取。（进入crawl）

②主控制流程：crawl 方法（摁住ctrl，点击crawler.crawl）

self.setup_driver() 启动 Chrome
预处理验证码

先访问选车工具首页 https://car.yiche.com/xuanchegongju/?f=16，如果发现页面包含 captcha，则调用 wait_for_manual_verification() 暂停，等待用户手动完成验证。这是为了避免每页都弹出验证码。

循环爬取每一页
调用 self.crawl_page(page_num) 获取该页的车辆数据。(等会点进去看)
每爬完 5 页自动保存一次中间结果（save_progress()）。
页与页之间随机等待 5~10 秒，降低请求频率。
最终保存与报告

循环结束后调用 save_progress('final_cars_with_config.json') 保存全部数据

③单页爬取：crawl_page 方法（摁住ctrl,点击cars = self.crawl_page(page_num)）

构造 URL：https://car.yiche.com/xuanchegongju/?f=16&page={page_num}。

用浏览器访问该 URL，随机睡眠 2~4 秒。若页面源码过短（少于 5000 个字符）或包含 captcha，则判定为被拦截，暂停并等待用户手动完成验证。
调用 self.parse_current_page() 解析当前页面，返回该页所有车辆信息（含每辆车的参数配置）。(等会点进去看)
返回车辆列表，若出错则返回空列表。

④页面解析：parse_current_page 方法

从当前浏览器页面中提取所有车辆的基本信息 + 参数配置：
获取页面源码，用 BeautifulSoup 解析。

找到所有 class="search-result-list-item" 的 div，每个代表一辆车。

对每个车辆元素调用 self.extract_car_info(item)，得到包含 name（名称）、price（价格）、url（详情页链接）三个字段的字典。

如果 name 非空，再调用 self.crawl_config_page(car_info['url']) 获取该车的详细参数（厂商、级别、能源类型等）。(等会点进去看)

将配置字典合并到 car_info 中，加入 cars 列表。

最终返回 cars。

⑤爬取车辆参数配置页：crawl_config_page 方法

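根据车辆链接拼接出参数配置页的 URL（在链接末尾追加 /peizhi/），访问该页后随机等待 2~4 秒。
用 BeautifulSoup 解析页面，遍历 class="param-table" 的表格，从每一行的前两个单元格中提取厂商、级别、能源类型、上市时间、车身结构、座位数等字段，以字典形式返回。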

这就是从 main 开始，按执行流程解读的核心内容。