crawl4ai实现Bing浏览器搜索 2026年04月26日 python, aigc 预计阅读 0 分钟 ```python import asyncio import json from urllib.parse import quote from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai.extraction_strategy import JsonCssExtractionStrategy # ==================== 1. CSS 提取策略 ==================== bing_schema = { "name": "Bing Search Results", "baseSelector": "li.b_algo", "fields": [ { "name": "title", "selector": "h2 a", "type": "text" }, { "name": "link", "selector": "h2 a", "type": "attribute", "attribute": "href" }, { "name": "snippet", "selector": "div.b_caption > p, div.b_caption p, div.b_snippet, p.b_algoSlug", "type": "text" } ] } extraction_strategy = JsonCssExtractionStrategy(bing_schema) # ==================== 2. 翻页搜索函数 ==================== async def search_bing(query: str, num_results: int = 10): """ 使用 crawl4ai 模拟 Bing 搜索,支持翻页获取指定条数 Args: query: 搜索关键词 num_results: 期望返回总条数 Returns: list[dict]: [{title, url, description}, ...] """ encoded_query = quote(query) per_page = 10 # Bing 每页约 10 条 all_results = [] browser_config = BrowserConfig( headless=True, verbose=False, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0", viewport_width=1920, viewport_height=1080, ) run_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=extraction_strategy, page_timeout=30000, wait_for="css:li.b_algo", remove_overlay_elements=True, ) async with AsyncWebCrawler(config=browser_config) as crawler: page = 0 while len(all_results) < num_results: # Bing 分页参数: first=1, 11, 21, 31... first = page * per_page + 1 search_url = f"https://cn.bing.com/search?q={encoded_query}&first={first}" print(f"📄 正在爬取第 {page + 1} 页 (first={first})...") result = await crawler.arun(url=search_url, config=run_config) if not result.success: print(f"❌ 第 {page + 1} 页爬取失败: {result.error_message}") break try: data = json.loads(result.extracted_content) page_results = [ { "title": item["title"].strip(), "link": item["link"], "snippet": " ".join(item.get("snippet", "").split()) if item.get("snippet") else "" } for item in data if item.get("title") and item.get("link") ] if not page_results: print(f"⚠️ 第 {page + 1} 页无结果,停止翻页") break all_results.extend(page_results) print(f" ✅ 第 {page + 1} 页获取 {len(page_results)} 条,累计 {len(all_results)} 条") # 如果本页结果不足 per_page,说明已到最后一页 if len(page_results) < per_page: break except json.JSONDecodeError: print("❌ JSON 解析失败") break page += 1 # 礼貌延迟,避免触发反爬 if len(all_results) < num_results: await asyncio.sleep(1.5) return all_results[:num_results] # ==================== 3. 使用示例 ==================== async def main(): query = "Python 异步编程" num = 20 # 要获取 20 条 print(f"🔍 搜索: {query},目标 {num} 条\n") results = await search_bing(query, num_results=num) print(f"\n{'=' * 50}") print(f"✅ 最终获取 {len(results)} 条结果:\n") for idx, item in enumerate(results, 1): print(f"[{idx}] {item['title']}") print(f" URL: {item['url']}") desc = item['description'][:120] + "..." if len(item['description']) > 120 else item['description'] print(f" 简介: {desc if desc else '(无简介)'}") print() if __name__ == "__main__": asyncio.run(main()) ```
评论区