crawl4ai实现Bing浏览器搜索

2026年04月26日 python, aigc 预计阅读 0 分钟

```python
import asyncio
import json
from urllib.parse import quote
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# ==================== 1. CSS 提取策略 ====================
# Extraction schema: every organic Bing hit is an <li class="b_algo"> element;
# the three fields below are read relative to that element.
bing_schema = {
    "name": "Bing Search Results",
    "baseSelector": "li.b_algo",
    "fields": [
        # Result headline: the text of the anchor inside the <h2>.
        {"name": "title", "selector": "h2 a", "type": "text"},
        # Target address: the href of that same anchor.
        {"name": "link", "selector": "h2 a", "type": "attribute", "attribute": "href"},
        # Summary text: several alternative selectors are listed because the
        # snippet markup is not the same for every result layout.
        {
            "name": "snippet",
            "selector": "div.b_caption > p, div.b_caption p, div.b_snippet, p.b_algoSlug",
            "type": "text",
        },
    ],
}

# Shared strategy instance handed to the crawler run configuration.
extraction_strategy = JsonCssExtractionStrategy(bing_schema)

# ==================== 2. 翻页搜索函数 ====================
async def search_bing(query: str, num_results: int = 10):
    """
    使用 crawl4ai 模拟 Bing 搜索，支持翻页获取指定条数

Args:
        query: 搜索关键词
        num_results: 期望返回总条数

Returns:
        list[dict]: [{title, url, description}, ...]
    """
    encoded_query = quote(query)
    per_page = 10  # Bing 每页约 10 条
    all_results = []

browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        viewport_width=1920,
        viewport_height=1080,
    )

run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=extraction_strategy,
        page_timeout=30000,
        wait_for="css:li.b_algo",
        remove_overlay_elements=True,
    )

async with AsyncWebCrawler(config=browser_config) as crawler:
        page = 0
        while len(all_results) < num_results:
            # Bing 分页参数: first=1, 11, 21, 31...
            first = page * per_page + 1
            search_url = f"https://cn.bing.com/search?q={encoded_query}&first={first}"

print(f"📄 正在爬取第 {page + 1} 页 (first={first})...")

result = await crawler.arun(url=search_url, config=run_config)

if not result.success:
                print(f"❌ 第 {page + 1} 页爬取失败: {result.error_message}")
                break

try:
                data = json.loads(result.extracted_content)
                page_results = [
                    {
                        "title": item["title"].strip(),
                        "link": item["link"],
                        "snippet": " ".join(item.get("snippet", "").split()) if item.get("snippet") else ""
                    }
                    for item in data
                    if item.get("title") and item.get("link")
                ]

if not page_results:
                    print(f"⚠️ 第 {page + 1} 页无结果，停止翻页")
                    break

all_results.extend(page_results)
                print(f"   ✅ 第 {page + 1} 页获取 {len(page_results)} 条，累计 {len(all_results)} 条")

# 如果本页结果不足 per_page，说明已到最后一页
                if len(page_results) < per_page:
                    break

except json.JSONDecodeError:
                print("❌ JSON 解析失败")
                break

page += 1

# 礼貌延迟，避免触发反爬
            if len(all_results) < num_results:
                await asyncio.sleep(1.5)

return all_results[:num_results]

# ==================== 3. 使用示例 ====================
async def main():
    """Run a sample Bing search and print the collected results to stdout."""
    query = "Python 异步编程"
    num = 20  # number of results to request

    print(f"🔍 搜索: {query}，目标 {num} 条\n")

    results = await search_bing(query, num_results=num)

    print(f"\n{'=' * 50}")
    print(f"✅ 最终获取 {len(results)} 条结果:\n")

    for idx, item in enumerate(results, 1):
        print(f"[{idx}] {item['title']}")
        # search_bing returns the keys "title", "link" and "snippet".
        print(f"    URL: {item['link']}")
        snippet = item["snippet"]
        # Truncate long snippets so each result stays on a short line.
        desc = snippet[:120] + "..." if len(snippet) > 120 else snippet
        print(f"    简介: {desc if desc else '（无简介）'}")
        print()

if __name__ == "__main__":
    # Run the demo search only when executed as a script, not on import.
    asyncio.run(main())
```

评论区