Show HN: Crawlee for Python – a web scraping and browser automation library

Original link: https://crawlee.dev/python/

Hey all, this is Jan, the founder of Apify (https://apify.com/), a full-stack web scraping platform. After the success of Crawlee for JavaScript (https://github.com/apify/crawlee/) and the demand from the Python community, we are launching Crawlee for Python today! The main features are:

- A unified programming interface for both HTTP crawling (HTTPX with BeautifulSoup) and headless browser crawling (Playwright); a minimal HTTP-crawler sketch follows this list
- Automatic parallel crawling based on available system resources
- Written in Python with type hints for an enhanced developer experience
- Automatic retries on errors or when you are getting blocked
- Integrated proxy rotation and session management
- Configurable request routing: direct URLs to the appropriate handlers
- A persistent queue of URLs to crawl
- Pluggable storage for both tabular data and files

For details, you can read the announcement blog post: https://crawlee.dev/blog/launching-crawlee-python

Our team and I will be happy to answer any questions you might have here.
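To illustrate the unified programming interface, here is a minimal sketch of the same kind of crawl using the HTTP-based BeautifulSoupCrawler instead of a browser. It mirrors the Playwright example further below; the import path and the context's soup attribute follow the launch-era Crawlee for Python docs and may have moved in later releases.

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Same router/handler model as the Playwright crawler, but pages are
    # fetched with HTTPX and parsed with BeautifulSoup (no browser needed).
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=5)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        await context.enqueue_links()
        await context.push_data({
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        })

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())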

Original code example:
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # Launch a headful Firefox-based crawler capped at five requests.
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=5,
        headless=False,
        browser_type='firefox',
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Add all links found on the page to the crawl queue.
        await context.enqueue_links()

        # Extract the URL, the page title, and the first 100 characters
        # of the rendered HTML.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'content': (await context.page.content())[:100],
        }

        await context.push_data(data)

    await crawler.run(['https://crawlee.dev'])

    await crawler.export_data('results.json')

    data = await crawler.get_data()
    crawler._logger.info(f'Extracted data: {data.items}')


if __name__ == '__main__':
    asyncio.run(main())