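"""Unit tests for crawl4ai's WebCrawler.

Covers warmup, the default and alternative chunking/extraction
strategies, error handling for bad input, cache behavior, and
multi-page crawls. The tests fetch live pages over the network, and
test_run_different_strategies expects OPENAI_API_KEY to be set.

Run with `python -m unittest` or execute this file directly.
"""
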
import os
import unittest

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy


class TestWebCrawler(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler()

    def test_warmup(self):
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")

    def test_run_default_strategies(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
            extraction_strategy=CosineStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract using default strategies")

    def test_run_different_strategies(self):
        url = 'https://www.nbcnews.com/business'

        # Test with FixedLengthWordChunking and LLMExtractionStrategy.
        # Requires OPENAI_API_KEY in the environment; fails without it.
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-3.5-turbo",
                api_token=os.getenv('OPENAI_API_KEY')
            ),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")

        # Test with SlidingWindowChunking and TopicExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=TopicExtractionStrategy(num_keywords=5),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")

    def test_invalid_url(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='invalid_url', bypass_cache=True)
        self.assertIn("Invalid URL", str(context.exception))

    def test_unsupported_extraction_strategy(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
        self.assertIn("Unsupported extraction strategy", str(context.exception))

    def test_invalid_css_selector(self):
        with self.assertRaises(ValueError) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
        self.assertIn("Invalid CSS selector", str(context.exception))

    def test_crawl_with_cache_and_bypass_cache(self):
        url = 'https://www.nbcnews.com/business'

        # First crawl with cache enabled
        result = self.crawler.run(url=url, bypass_cache=False)
        self.assertTrue(result.success, "Failed to crawl and cache the result")

        # Second crawl with bypass_cache=True
        result = self.crawler.run(url=url, bypass_cache=True)
        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")

    def test_fetch_multiple_pages(self):
        urls = [
            'https://www.nbcnews.com/business',
            'https://www.bbc.com/news'
        ]
        results = []
        for url in urls:
            result = self.crawler.run(
                url=url,
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
                bypass_cache=True
            )
            results.append(result)
        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
        for result in results:
            self.assertTrue(result.success, "Failed to crawl and extract a page in the list")

    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")

    def test_run_sliding_window_and_no_extraction(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")


if __name__ == '__main__':
    unittest.main()