import os
import warnings

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class Crawler:
    # Separator inserted between votes when they are concatenated into a single
    # string, e.g. "vote A |split| vote B |split| "
    vote_splitter = " |split| "

    def __init__(self, base_url: str, list_url: str,
                 base_vote_url: str, models_path: str, result_path: str):
| if base_url == "": | |
| self.base_url ="https://ara.jri.ac.ir/" | |
| else: | |
| self.base_url = base_url | |
| if list_url == "": | |
| self.list_url ="https://ara.jri.ac.ir/Judge/Index" | |
| else: | |
| self.list_url = list_url | |
| if base_vote_url == "": | |
| self.base_vote_url ="https://ara.jri.ac.ir/Judge/Text/" | |
| else: | |
| self.base_vote_url = base_vote_url | |
| if models_path == "": | |
| self.models_path ="Models/" | |
| else: | |
| self.models_path = models_path | |
| self.pos_model_path = os.path.join(models_path, "postagger.model") | |
| self.chunker_path = os.path.join(models_path, "chunker.model") | |
| if result_path == "": | |
| self.result_path = "Resource/" | |
| else: | |
| self.result_path = result_path | |
| self.merges_vote_path = os.path.join(result_path, 'merged_vote.txt') | |
| self.clean_vote_path = os.path.join(result_path, 'clean_vote.txt') | |
| self.clean_vote_path_csv = os.path.join(result_path, 'clean_vote.csv') | |
| self.selected_vote_path = os.path.join(result_path, 'selected_vote.txt') | |
| self.law_list_path = os.path.join(result_path, 'law_list.txt') | |
| self.law_clean_list_path = os.path.join(result_path, 'law_clean_list.txt') | |
| self.vote_stop_path = os.path.join(result_path, "vote_stopwords.txt") | |
| self.law_stop_path = os.path.join(result_path, "law_stopwords.txt") | |

    @staticmethod
    def check_valid_vote(html_soup: BeautifulSoup) -> bool:
        # A valid vote page carries a non-empty title after the <span> inside
        # <h1 class="Title3D">; pages without it are placeholders
        h1_element = html_soup.find('h1', class_='Title3D')
        if h1_element is None:
            return False
        span_element = h1_element.find('span')
        if span_element is None:
            return False
        span_text = span_element.text  # Text within the <span> tag
        full_text = h1_element.text  # Full text within the <h1> element
        text_after_span = full_text.split(span_text)[-1].strip()  # Text after the </span> tag
        return len(text_after_span) > 0
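
    # A minimal sanity check for check_valid_vote (hypothetical markup, not
    # copied from ara.jri.ac.ir):
    # >>> soup = BeautifulSoup('<h1 class="Title3D"><span>Vote:</span> Case 123</h1>', 'html.parser')
    # >>> Crawler.check_valid_vote(soup)
    # True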

    @staticmethod
    def html_data_extractor(html_soup: BeautifulSoup, vote_splitter: str) -> str:
        # Collect the title, metadata cell, and body text of one vote page
        vote_text = html_soup.find('div', id='treeText', class_='BackText')
        title = html_soup.find('h1', class_='Title3D')
        info = html_soup.find('td', valign="top", class_="font-size-small")
        # Append vote_splitter so the records can be separated again later
        return str(title) + str(info) + str(vote_text) + vote_splitter

    def vote_crawler(self, start: int, end: int, separator: int):
        counter = 0  # Counts valid votes crawled so far
        result_list = []
        # Suppress the InsecureRequestWarning triggered by verify=False below
        warnings.filterwarnings("ignore")
        # Request every vote page with an id in [start, end)
        for i in tqdm(range(start, end)):
            url = self.base_vote_url + f"{i}"
            response = requests.get(url, verify=False)
            # The pages are in Persian; decode them as UTF-8
            response.encoding = 'utf-8'
            html_soup = BeautifulSoup(response.text, 'html.parser')
            if response.ok and self.check_valid_vote(html_soup):
                counter += 1
                result_list.append(self.html_data_extractor(html_soup, self.vote_splitter))
            # Flush every `separator` valid votes to their own .txt file
            if counter % separator == 0 and counter > 0 and result_list:
                with open(os.path.join(self.result_path, f'vote{i}.txt'), "w", encoding='utf-8') as text_file:
                    text_file.write(''.join(result_list))
                result_list = []
        # Write whatever is left after the last full batch
        if result_list:
            with open(os.path.join(self.result_path, f'vote{end - 1}.txt'), "w", encoding='utf-8') as text_file:
                text_file.write(''.join(result_list))
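
    # Hypothetical helper (a sketch, not in the original script): a shared
    # requests.Session with urllib3 retries can make bulk crawling more robust
    # against transient 5xx responses. vote_crawler above does not call this;
    # to use it, replace requests.get with session.get.
    def make_session(self, retries: int = 3, backoff: float = 0.5) -> requests.Session:
        from requests.adapters import HTTPAdapter  # Local imports keep the sketch self-contained
        from urllib3.util.retry import Retry
        session = requests.Session()
        retry = Retry(total=retries, backoff_factor=backoff,
                      status_forcelist=[500, 502, 503, 504])
        session.mount("https://", HTTPAdapter(max_retries=retry))
        return session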

    def merge_out_txt(self) -> None:
        # Concatenate every crawled vote chunk into the single merged file
        with open(self.merges_vote_path, 'w', encoding='utf-8') as outfile:
            for filename in sorted(os.listdir(self.result_path)):
                if filename.startswith("vote") and filename.endswith('.txt'):  # Only merge vote .txt chunks
                    with open(os.path.join(self.result_path, filename), 'r', encoding='utf-8') as infile:
                        outfile.write(infile.read())
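
    # Hypothetical convenience helper (a sketch, not part of the original
    # script): split the merged file back into individual vote records using
    # vote_splitter, which html_data_extractor appended after each record.
    def load_merged_votes(self) -> list:
        with open(self.merges_vote_path, 'r', encoding='utf-8') as infile:
            records = infile.read().split(self.vote_splitter)
        # The last split piece is empty because every record ends with the splitter
        return [record for record in records if record.strip()]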
| if __name__ == "__main__": | |
| models_path = input("Enter the models path (initial value = https://ara.jri.ac.ir/): ") | |
| result_path = input("Enter the result path (initial value = https://ara.jri.ac.ir/Judge/Index): ") | |
| base_url = input("Enter the base URL (initial value = https://ara.jri.ac.ir/Judge/Text/): ") | |
| list_url = input("Enter the list URL (initial value = Models/ ): ") | |
| base_vote_url = input("Enter the base vote URL (initial value = Resource/ ): ") | |
| crawler_instance = Crawler(models_path=models_path, result_path=result_path, base_url=base_url, list_url=list_url, base_vote_url=base_vote_url) | |
| start = int(input("Enter the start value for vote crawling: ")) | |
| end = int(input("Enter the end value for vote crawling: ")) | |
| separator = int(input("Enter the separator value for vote crawling: ")) | |
| crawler_instance.vote_crawler(start=start, end=end, separator=separator) | |
| crawler_instance.merge_out_txt() | |
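
# Example run (illustrative values): pressing Enter at every path prompt keeps
# the defaults, and crawling ids 1..1000 with separator 200 writes chunk files
# Resource/vote*.txt, then merges them into Resource/merged_vote.txt.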