import os
import sys
import json
import asyncio

import pytest

# Add the parent directory to the Python path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking, NlpSentenceChunking
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy
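
# These tests exercise crawl4ai's chunking and extraction strategies against a
# live page (https://www.nbcnews.com/business), so they require network access;
# the LLM extraction test additionally expects OPENAI_API_KEY in the environment.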

@pytest.mark.asyncio
async def test_regex_chunking():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        chunking_strategy = RegexChunking(patterns=["\n\n"])
        result = await crawler.arun(
            url=url,
            chunking_strategy=chunking_strategy,
            bypass_cache=True
        )
        assert result.success
        assert result.extracted_content
        chunks = json.loads(result.extracted_content)
        assert len(chunks) > 1  # Ensure multiple chunks were created
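
# A minimal network-free sketch (an assumption, not part of the original suite):
# it presumes RegexChunking exposes a synchronous chunk(text) -> list method, as
# in crawl4ai's ChunkingStrategy interface, and checks the split directly.
def test_regex_chunking_local():
    chunking_strategy = RegexChunking(patterns=["\n\n"])
    chunks = chunking_strategy.chunk("First paragraph.\n\nSecond paragraph.")
    assert len(chunks) > 1  # the double newline should yield at least two chunks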

# @pytest.mark.asyncio
# async def test_cosine_strategy():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.nbcnews.com/business"
#         extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
#         result = await crawler.arun(
#             url=url,
#             extraction_strategy=extraction_strategy,
#             bypass_cache=True
#         )
#         assert result.success
#         assert result.extracted_content
#         extracted_data = json.loads(result.extracted_content)
#         assert len(extracted_data) > 0
#         assert all('tags' in item for item in extracted_data)

@pytest.mark.asyncio
async def test_llm_extraction_strategy():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        extraction_strategy = LLMExtractionStrategy(
            provider="openai/gpt-4o-mini",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="Extract only content related to technology"
        )
        result = await crawler.arun(
            url=url,
            extraction_strategy=extraction_strategy,
            bypass_cache=True
        )
        assert result.success
        assert result.extracted_content
        extracted_data = json.loads(result.extracted_content)
        assert len(extracted_data) > 0
        assert all('content' in item for item in extracted_data)
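
# A hedged hardening sketch (not in the original): pytest's skipif marker can
# guard the LLM test above when no API key is configured, e.g.:
#
#   @pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set")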

# @pytest.mark.asyncio
# async def test_combined_chunking_and_extraction():
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         url = "https://www.nbcnews.com/business"
#         chunking_strategy = RegexChunking(patterns=["\n\n"])
#         extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
#         result = await crawler.arun(
#             url=url,
#             chunking_strategy=chunking_strategy,
#             extraction_strategy=extraction_strategy,
#             bypass_cache=True
#         )
#         assert result.success
#         assert result.extracted_content
#         extracted_data = json.loads(result.extracted_content)
#         assert len(extracted_data) > 0
#         assert all('tags' in item for item in extracted_data)
#         assert all('content' in item for item in extracted_data)

# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])