import asyncio
from bs4 import BeautifulSoup
import os
import sys
import time
import csv
from tabulate import tabulate
from dataclasses import dataclass
from typing import Any, Dict, List

# Make the project root (three levels up from this file) importable when the
# script is run directly, and resolve data files relative to this directory.
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

from crawl4ai.content_scraping_strategy import WebScrapingStrategy
# Both aliases currently resolve to the same class; swap in the commented
# import below to compare against an alternative implementation.
from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent


@dataclass
class TestResult:
    """Summary metrics collected from a single scraper run."""
    name: str
    success: bool
    images: int
    internal_links: int
    external_links: int
    markdown_length: int
    execution_time: float


class StrategyTester:
    def __init__(self):
        self.new_scraper = WebScrapingStrategy()
        self.current_scraper = WebScrapingStrategyCurrent()
        with open(__location__ + '/sample_wikipedia.html', 'r', encoding='utf-8') as f:
            self.WIKI_HTML = f.read()
        self.results = {'new': [], 'current': []}

    def run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]:
        """Run both strategies on the same HTML and collect comparable metrics."""
        results = []
        for scraper in [self.new_scraper, self.current_scraper]:
            start_time = time.time()
            result = scraper._get_content_of_website_optimized(
                url="https://en.wikipedia.org/wiki/Test",
                html=self.WIKI_HTML,
                **kwargs
            )
            execution_time = time.time() - start_time
            test_result = TestResult(
                name=name,
                success=result['success'],
                images=len(result['media']['images']),
                internal_links=len(result['links']['internal']),
                external_links=len(result['links']['external']),
                markdown_length=len(result['markdown']),
                execution_time=execution_time
            )
            results.append(test_result)
        return results[0], results[1]  # new, current
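
    # For reference, run_test assumes the scraper result exposes at least the
    # keys accessed above (inferred from this file, not from crawl4ai docs):
    #
    #     {
    #         'success': True,
    #         'media': {'images': [...]},
    #         'links': {'internal': [...], 'external': [...]},
    #         'markdown': '...'
    #     }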

    def run_all_tests(self):
        test_cases = [
            ("Basic Extraction", {}),
            ("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}),
            ("Word Threshold", {'word_count_threshold': 50}),
            ("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}),
            ("Link Exclusions", {
                'exclude_external_links': True,
                'exclude_social_media_links': True,
                'exclude_domains': ['facebook.com', 'twitter.com']
            }),
            ("Media Handling", {
                'exclude_external_images': True,
                'image_description_min_word_threshold': 20
            }),
            ("Text Only", {
                'only_text': True,
                'remove_forms': True
            }),
            ("HTML Cleaning", {
                'clean_html': True,
                'keep_data_attributes': True
            }),
            ("HTML2Text Options", {
                'html2text': {
                    'skip_internal_links': True,
                    'single_line_break': True,
                    'mark_code': True,
                    'preserve_tags': ['pre', 'code']
                }
            })
        ]
        all_results = []
        for name, kwargs in test_cases:
            try:
                new_result, current_result = self.run_test(name, **kwargs)
                all_results.append((name, new_result, current_result))
            except Exception as e:
                print(f"Error in {name}: {str(e)}")
        self.save_results_to_csv(all_results)
        self.print_comparison_table(all_results)

    def save_results_to_csv(self, all_results: List[tuple]):
        csv_file = os.path.join(__location__, 'strategy_comparison_results.csv')
        with open(csv_file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                             'External Links', 'Markdown Length', 'Execution Time'])
            for name, new_result, current_result in all_results:
                writer.writerow([name, 'New', new_result.success, new_result.images,
                                 new_result.internal_links, new_result.external_links,
                                 new_result.markdown_length, f"{new_result.execution_time:.3f}"])
                writer.writerow([name, 'Current', current_result.success, current_result.images,
                                 current_result.internal_links, current_result.external_links,
                                 current_result.markdown_length, f"{current_result.execution_time:.3f}"])

    def print_comparison_table(self, all_results: List[tuple]):
        table_data = []
        headers = ['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                   'External Links', 'Markdown Length', 'Time (s)']
        for name, new_result, current_result in all_results:
            # Check for differences
            differences = []
            if new_result.images != current_result.images:
                differences.append('images')
            if new_result.internal_links != current_result.internal_links:
                differences.append('internal_links')
            if new_result.external_links != current_result.external_links:
                differences.append('external_links')
            if new_result.markdown_length != current_result.markdown_length:
                differences.append('markdown')
            # Add row for new strategy
            new_row = [
                name, 'New', new_result.success, new_result.images,
                new_result.internal_links, new_result.external_links,
                new_result.markdown_length, f"{new_result.execution_time:.3f}"
            ]
            table_data.append(new_row)
            # Add row for current strategy
            current_row = [
                '', 'Current', current_result.success, current_result.images,
                current_result.internal_links, current_result.external_links,
                current_result.markdown_length, f"{current_result.execution_time:.3f}"
            ]
            table_data.append(current_row)
            # Add difference summary if any
            if differences:
                table_data.append(['', '⚠️ Differences', ', '.join(differences), '', '', '', '', ''])
            # Add empty row for better readability
            table_data.append([''] * len(headers))
        print("\nStrategy Comparison Results:")
        print(tabulate(table_data, headers=headers, tablefmt='grid'))


if __name__ == "__main__":
    tester = StrategyTester()
    tester.run_all_tests()
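
# Usage sketch (assumptions, not verified here: crawl4ai, beautifulsoup4, and
# tabulate are installed, and sample_wikipedia.html sits next to this script):
#
#     python compare_scraping_strategies.py   # script name is illustrative
#
# The comparison grid is printed to stdout and the same rows are written to
# strategy_comparison_results.csv in the script's directory.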