Python中有较多的网络爬虫库:
Requests + BeautifulSoup:小型静态网站抓取。
Scrapy:大型、规则明晰的静态网站或 API 爬取。
Selenium:动态页面、需要交互的场景。也可作为 Scrapy 的下载器中间件。
每个库都有各自的适用场合。如果爬取的页面中包含登录验证,或发送请求时进行了签名验证,则无法使用前两种。Selenium是一种模拟浏览器的自动化工具。
易车网数据,包含了签名加密,因此我们采用Selenium爬取。
现通过Trae编写Python代码实现了一个基于Selenium的爬虫,用于爬取易车网(car.yiche.com)的车辆列表和参数配置。
# -*- coding: UTF-8 -*-
"""Selenium crawler for car.yiche.com vehicle listings and their spec pages."""
import json
import random
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Root used to resolve scheme-relative and site-relative links.
_SITE_ROOT = 'https://car.yiche.com'

# Car-selection tool listing; individual pages append "&page=N".
_LIST_URL = 'https://car.yiche.com/xuanchegongju/?f=16'

# Row label on the spec page -> key stored in the car dict.
_CONFIG_FIELDS = {
    '厂商': 'manufacturer',
    '级别': 'level',
    '能源类型': 'energy_type',
    '上市时间': 'launch_date',
    '车身结构': 'body_type',
    '座位数': 'seats',
}


class SeleniumCrawler:
    """Crawl the vehicle list and each vehicle's spec page with Selenium.

    Attributes are plain instance state:
      driver       -- the Chrome WebDriver, or None when no browser is open.
      all_cars     -- list of car dicts collected so far.
      failed_pages -- list page numbers that yielded no cars.
    """

    def __init__(self):
        self.driver = None
        self.all_cars = []
        self.failed_pages = []

    def setup_driver(self):
        """Start Chrome with automation hints disabled and store it on self."""
        chrome_options = Options()
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0.0.0 Safari/537.36'
        )
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        print("启动浏览器...")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.implicitly_wait(10)

    def wait_for_manual_verification(self):
        """Block until the operator confirms the captcha has been solved."""
        print("\n" + "=" * 60)
        print("如果看到验证码,请手动完成验证")
        # input() reads from the terminal, not the browser window, so the
        # prompt must send the user to the terminal.
        print("完成后,在终端中按 ENTER 继续")
        print("=" * 60)
        input()

    def extract_car_info(self, car_item):
        """Pull name, price and detail link out of one list-item element.

        Args:
            car_item: an element (or HTML string) for one search result.

        Returns:
            A dict with keys 'name', 'price' and 'url', or None if
            extraction raised.
        """
        try:
            soup = BeautifulSoup(str(car_item), 'html.parser')
            name_tag = soup.find('p', class_='cx-name')
            price_tag = soup.find('p', class_='cx-price')
            link_tag = soup.find('a', href=True)
            return {
                'name': name_tag.get_text(strip=True) if name_tag else '',
                'price': price_tag.get_text(strip=True) if price_tag else '价格未知',
                'url': link_tag['href'] if link_tag else '',
            }
        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            print(f" [X] 提取基本信息失败: {e}")
            return None

    @staticmethod
    def _build_config_url(base_url):
        """Return the absolute '/peizhi/' (spec page) URL for a detail link."""
        # urljoin handles '//host/path', '/path' and already-absolute URLs,
        # and additionally resolves bare relative paths against the site root.
        absolute = urljoin(_SITE_ROOT + '/', base_url)
        return absolute.rstrip('/') + '/peizhi/'

    def crawl_config_page(self, base_url):
        """Visit a vehicle's spec page and collect the known fields.

        Args:
            base_url: the vehicle detail link taken from the list page.

        Returns:
            A dict holding whichever of the keys in _CONFIG_FIELDS were found;
            an empty dict when base_url is empty or the visit failed.
        """
        if not base_url:
            return {}

        config_url = self._build_config_url(base_url)
        print(f" 访问参数页: {config_url}")
        try:
            self.driver.get(config_url)
            time.sleep(random.uniform(2, 4))
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            config = {}
            for table in soup.find_all('table', class_='param-table'):
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) < 2:
                        continue
                    field = _CONFIG_FIELDS.get(cells[0].get_text(strip=True))
                    if field:
                        config[field] = cells[1].get_text(strip=True)

            print(f" 配置信息: 厂商={config.get('manufacturer','-')}, 级别={config.get('level','-')}, 能源={config.get('energy_type','-')}")
            return config
        except Exception as e:
            # Best effort: a failed spec page still leaves the basic car info.
            print(f" [X] 获取配置失败: {e}")
            return {}

    def parse_current_page(self):
        """Parse the list page currently loaded in the browser.

        Returns:
            A list of car dicts (basic info merged with spec fields); an empty
            list when parsing raised.
        """
        try:
            # The page source is snapshotted before any spec page is visited,
            # so navigating away inside the loop does not affect the item list.
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            car_items = soup.find_all('div', class_='search-result-list-item')

            cars = []
            for idx, item in enumerate(car_items, 1):
                car_info = self.extract_car_info(item)
                if not car_info or not car_info['name']:
                    continue
                print(f" [{idx}] {car_info['name']}")
                car_info.update(self.crawl_config_page(car_info['url']))
                cars.append(car_info)
            return cars
        except Exception as e:
            print(f" 解析错误: {e}")
            return []

    def crawl_page(self, page_num):
        """Load one list page and return its cars.

        Args:
            page_num: page number placed in the 'page' query parameter.

        Returns:
            The list of car dicts found on the page, or an empty list when
            nothing was found or an error occurred.
        """
        url = f'https://car.yiche.com/xuanchegongju/?f=16&page={page_num}'
        print(f"\n 正在访问: {url}")
        try:
            self.driver.get(url)
            time.sleep(random.uniform(2, 4))

            # A very short page or one mentioning "captcha" is treated as blocked.
            page_source = self.driver.page_source
            if len(page_source) < 5000 or 'captcha' in page_source.lower():
                print(" 检测到验证码,请手动验证...")
                self.wait_for_manual_verification()
                time.sleep(2)

            cars = self.parse_current_page()
            if cars:
                print(f" ✓ 本页找到 {len(cars)} 个车辆")
                return cars
            print(" ✗ 未找到车辆数据")
            return []
        except Exception as e:
            print(f" ✗ 错误: {e}")
            return []

    def save_progress(self, filename='selenium_progress.json'):
        """Write collected cars and failed pages to a UTF-8 JSON file.

        Args:
            filename: path of the JSON file to (over)write.
        """
        data = {
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'total_cars': len(self.all_cars),
            'failed_pages': self.failed_pages,
            'cars': self.all_cars,
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"\n [已保存] {filename}")

    def crawl(self, start_page=1, end_page=20):
        """Crawl list pages start_page..end_page (inclusive) and save results.

        Progress is written whenever the page number is a multiple of 5, to
        'final_cars_with_config.json' on completion, and to
        'interrupted_progress.json' on Ctrl-C. The browser is always closed
        on exit.

        Args:
            start_page: first list page to crawl.
            end_page: last list page to crawl.
        """
        print("=" * 60)
        print("Selenium爬虫 - 爬取车辆列表和参数配置")
        print(f"目标: 第 {start_page} - {end_page} 页")
        print("=" * 60)

        try:
            self.setup_driver()

            # Visit the landing page first so a captcha can be solved up front.
            print("\n首先访问首页处理验证码...")
            self.driver.get(_LIST_URL)
            time.sleep(3)
            if 'captcha' in self.driver.page_source.lower():
                print("首页需要验证,请手动完成...")
                self.wait_for_manual_verification()

            total_pages = end_page - start_page + 1
            for page_num in range(start_page, end_page + 1):
                current = page_num - start_page + 1
                print(f"\n[{current}/{total_pages}] 爬取第 {page_num} 页...")

                cars = self.crawl_page(page_num)
                if cars:
                    self.all_cars.extend(cars)
                else:
                    self.failed_pages.append(page_num)

                # Periodic checkpoint.
                if page_num % 5 == 0:
                    self.save_progress()

                # Random pause between pages to keep the request rate low.
                if page_num < end_page:
                    delay = random.uniform(5, 10)
                    print(f" 等待 {delay:.1f} 秒...")
                    time.sleep(delay)

            self.save_progress('final_cars_with_config.json')

            print("\n" + "=" * 60)
            print("爬取完成!")
            print(f"总计: {len(self.all_cars)} 辆车")
            print(f"失败: {len(self.failed_pages)} 页")
            if self.failed_pages:
                print(f"失败页面: {self.failed_pages}")
            print("=" * 60)

            self.print_summary()
        except KeyboardInterrupt:
            print("\n\n用户中断,正在保存当前进度...")
            self.save_progress('interrupted_progress.json')
            print(f"已保存 {len(self.all_cars)} 个车辆")
        finally:
            if self.driver:
                try:
                    input("\n按 ENTER 关闭浏览器...")
                except (EOFError, KeyboardInterrupt):
                    # A second Ctrl-C or a closed stdin at the prompt must not
                    # skip quit() and leave the browser process running.
                    pass
                finally:
                    self.driver.quit()
                    self.driver = None

    def print_summary(self):
        """Print a fixed-width table of the first ten collected cars."""
        print("\n参数配置摘要:")
        print("-" * 80)
        print(f"{'名称':<15}{'价格':<12}{'厂商':<12}{'级别':<8}{'能源类型':<10}{'上市时间':<10}")
        print("-" * 80)
        for car in self.all_cars[:10]:
            print(f"{car['name'][:13]:<15}{car['price'][:10]:<12}{car.get('manufacturer','')[:10]:<12}{car.get('level','')[:6]:<8}{car.get('energy_type','')[:8]:<10}{car.get('launch_date','')[:8]:<10}")
        if len(self.all_cars) > 10:
            print(f"\n... 还有 {len(self.all_cars) - 10} 辆车")


if __name__ == '__main__':
    crawler = SeleniumCrawler()
    crawler.crawl(start_page=1, end_page=20)
阅读此AI代码
以“入口 → 主要流程 → 关键方法”流程去读
①主程序入口:从main开始
创建 SeleniumCrawler 类的实例。(不需要进入)
调用 crawl 方法,传入起始页和结束页(1 到 20 页),开始爬取。(进入crawl)
②主控制流程:crawl 方法(摁住ctrl,点击crawler.crawl)
self.setup_driver() 启动 Chrome
预处理验证码
先访问选车工具首页 https://car.yiche.com/xuanchegongju/?f=16,如果发现页面包含 captcha,则调用 wait_for_manual_verification() 暂停,等待用户手动完成验证。这是为了避免每页都弹出验证码。
循环爬取每一页
调用 self.crawl_page(page_num) 获取该页的车辆数据。(等会点进去看)
每爬完 5 页自动保存一次中间结果(save_progress())。
页与页之间随机等待 5~10 秒,降低请求频率。
最终保存与报告
循环结束后调用 save_progress('final_cars_with_config.json') 保存全部数据
③单页爬取:crawl_page 方法(摁住ctrl,点击cars = self.crawl_page(page_num))
构造 URL:https://car.yiche.com/xuanchegongju/?f=16&page={page_num}。
用浏览器访问该 URL,随机睡眠 2~4 秒。
调用 self.parse_current_page() 解析当前页面,返回该页所有车辆信息(含每辆车的参数配置)。(等会点进去看)
返回车辆列表,若出错则返回空列表。
④页面解析:parse_current_page 方法
从当前浏览器页面中提取所有车辆的基本信息 + 参数配置:
获取页面源码,用 BeautifulSoup 解析。
找到所有 class="search-result-list-item" 的 div,每个代表一辆车。
对每个车辆元素调用 self.extract_car_info(item),得到包含 name(名称)、price(价格)、url(详情链接)三个字段的字典。
如果 name 非空,再调用 self.crawl_config_page(car_info['url']) 获取该车的详细参数(厂商、级别、能源类型等)。(等会点进去看)
将配置字典合并到 car_info 中,加入 cars 列表。
最终返回 cars。
⑤爬取车辆参数配置页:crawl_config_page 方法
访问配置页,随机等待 2~4 秒。
用 BeautifulSoup 解析页面
这就是从 main 开始,按执行流程解读的核心内容。
夜雨聆风