Show HN: Crawlee for Python – 一个网页抓取和浏览器自动化库
Show HN: Crawlee for Python – a web scraping and browser automation library
原始链接: https://crawlee.dev/python/
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl https://crawlee.dev with a Playwright-driven browser and export the results.

    Builds a ``PlaywrightCrawler``, registers a default request handler that
    records each page's URL, title and the first 100 characters of its HTML,
    runs the crawl, writes the collected data to ``results.json`` and finally
    logs the collected items.
    """
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=5,
        headless=False,
        browser_type='firefox',
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Handle one crawled page: enqueue its links and store a short summary."""
        context.log.info(f'Processing {context.request.url} ...')

        # Queue the links discovered on the current page for later requests.
        await context.enqueue_links()

        # NOTE(review): the '内容' key is kept exactly as it appears in the
        # source; it holds the first 100 characters of the page content.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            '内容': (await context.page.content())[:100],
        }

        await context.push_data(data)

    # Start the crawl from a single seed URL.
    await crawler.run(['https://crawlee.dev'])

    # Persist everything that the handler pushed.
    await crawler.export_data('results.json')

    # Read the collected data back and log its items.
    data = await crawler.get_data()
    crawler._logger.info(f'提取的数据:{data.items}')


if __name__ == '__main__':
    asyncio.run(main())