import os
import warnings

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class Crawler:
    # Separator inserted between votes when they are concatenated into a single
    # string, e.g. "vote A |split| vote B |split| "
    vote_splitter = " |split| "

    def __init__(self, base_url: str, list_url: str,
                 base_vote_url: str, models_path: str, result_path: str):
| if base_url == "": | |
| self.base_url ="https://ara.jri.ac.ir/" | |
| else: | |
| self.base_url = base_url | |
| if list_url == "": | |
| self.list_url ="https://ara.jri.ac.ir/Judge/Index" | |
| else: | |
| self.list_url = list_url | |
| if base_vote_url == "": | |
| self.base_vote_url ="https://ara.jri.ac.ir/Judge/Text/" | |
| else: | |
| self.base_vote_url = base_vote_url | |
| if models_path == "": | |
| self.models_path ="Models/" | |
| else: | |
| self.models_path = models_path | |
| self.pos_model_path = os.path.join(models_path, "postagger.model") | |
| self.chunker_path = os.path.join(models_path, "chunker.model") | |
| if result_path == "": | |
| self.result_path = "Resource/" | |
| else: | |
| self.result_path = result_path | |
| self.merges_vote_path = os.path.join(result_path, 'merged_vote.txt') | |
| self.clean_vote_path = os.path.join(result_path, 'clean_vote.txt') | |
| self.clean_vote_path_csv = os.path.join(result_path, 'clean_vote.csv') | |
| self.selected_vote_path = os.path.join(result_path, 'selected_vote.txt') | |
| self.law_list_path = os.path.join(result_path, 'law_list.txt') | |
| self.law_clean_list_path = os.path.join(result_path, 'law_clean_list.txt') | |
| self.vote_stop_path = os.path.join(result_path, "vote_stopwords.txt") | |
| self.law_stop_path = os.path.join(result_path, "law_stopwords.txt") | |

    @staticmethod
    def check_valid_vote(html_soup: BeautifulSoup) -> bool:
        # A valid vote page carries a non-empty title after the <span> inside
        # <h1 class="Title3D">; pages without it are placeholders
        h1_element = html_soup.find('h1', class_='Title3D')
        if h1_element is None:
            return False
        span_element = h1_element.find('span')
        if span_element is None:
            return False
        span_text = span_element.text  # Text within the <span> tag
        full_text = h1_element.text  # Full text within the <h1> element
        text_after_span = full_text.split(span_text)[-1].strip()  # Text after the </span> tag
        return len(text_after_span) > 0
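
    # A minimal sanity check for check_valid_vote (hypothetical markup, not
    # copied from ara.jri.ac.ir):
    # >>> soup = BeautifulSoup('<h1 class="Title3D"><span>Vote:</span> Case 123</h1>', 'html.parser')
    # >>> Crawler.check_valid_vote(soup)
    # True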

    @staticmethod
    def html_data_extractor(html_soup: BeautifulSoup, vote_splitter: str) -> str:
        # Collect the title, metadata cell, and body text of one vote page
        vote_text = html_soup.find('div', id='treeText', class_='BackText')
        title = html_soup.find('h1', class_='Title3D')
        info = html_soup.find('td', valign="top", class_="font-size-small")
        # Append vote_splitter so the records can be separated again later
        return str(title) + str(info) + str(vote_text) + vote_splitter

    def vote_crawler(self, start: int, end: int, separator: int):
        counter = 0  # Counts valid votes crawled so far
        result_list = []
        # Suppress the InsecureRequestWarning triggered by verify=False below
        warnings.filterwarnings("ignore")
        # Request every vote page with an id in [start, end)
        for i in tqdm(range(start, end)):
            url = self.base_vote_url + f"{i}"
            response = requests.get(url, verify=False)
            # The pages are in Persian; decode them as UTF-8
            response.encoding = 'utf-8'
            html_soup = BeautifulSoup(response.text, 'html.parser')
            if response.ok and self.check_valid_vote(html_soup):
                counter += 1
                result_list.append(self.html_data_extractor(html_soup, self.vote_splitter))
            # Flush every `separator` valid votes to their own .txt file
            if counter % separator == 0 and counter > 0 and result_list:
                with open(os.path.join(self.result_path, f'vote{i}.txt'), "w", encoding='utf-8') as text_file:
                    text_file.write(''.join(result_list))
                result_list = []
        # Write whatever is left after the last full batch
        if result_list:
            with open(os.path.join(self.result_path, f'vote{end - 1}.txt'), "w", encoding='utf-8') as text_file:
                text_file.write(''.join(result_list))
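
    # Hypothetical helper (a sketch, not in the original script): a shared
    # requests.Session with urllib3 retries can make bulk crawling more robust
    # against transient 5xx responses. vote_crawler above does not call this;
    # to use it, replace requests.get with session.get.
    def make_session(self, retries: int = 3, backoff: float = 0.5) -> requests.Session:
        from requests.adapters import HTTPAdapter  # Local imports keep the sketch self-contained
        from urllib3.util.retry import Retry
        session = requests.Session()
        retry = Retry(total=retries, backoff_factor=backoff,
                      status_forcelist=[500, 502, 503, 504])
        session.mount("https://", HTTPAdapter(max_retries=retry))
        return session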

    def merge_out_txt(self) -> None:
        # Concatenate every crawled vote chunk into the single merged file
        with open(self.merges_vote_path, 'w', encoding='utf-8') as outfile:
            for filename in sorted(os.listdir(self.result_path)):
                if filename.startswith("vote") and filename.endswith('.txt'):  # Only merge vote .txt chunks
                    with open(os.path.join(self.result_path, filename), 'r', encoding='utf-8') as infile:
                        outfile.write(infile.read())
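
    # Hypothetical convenience helper (a sketch, not part of the original
    # script): split the merged file back into individual vote records using
    # vote_splitter, which html_data_extractor appended after each record.
    def load_merged_votes(self) -> list:
        with open(self.merges_vote_path, 'r', encoding='utf-8') as infile:
            records = infile.read().split(self.vote_splitter)
        # The last split piece is empty because every record ends with the splitter
        return [record for record in records if record.strip()]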
| if __name__ == "__main__": | |
| models_path = input("Enter the models path (initial value = https://ara.jri.ac.ir/): ") | |
| result_path = input("Enter the result path (initial value = https://ara.jri.ac.ir/Judge/Index): ") | |
| base_url = input("Enter the base URL (initial value = https://ara.jri.ac.ir/Judge/Text/): ") | |
| list_url = input("Enter the list URL (initial value = Models/ ): ") | |
| base_vote_url = input("Enter the base vote URL (initial value = Resource/ ): ") | |
| crawler_instance = Crawler(models_path=models_path, result_path=result_path, base_url=base_url, list_url=list_url, base_vote_url=base_vote_url) | |
| start = int(input("Enter the start value for vote crawling: ")) | |
| end = int(input("Enter the end value for vote crawling: ")) | |
| separator = int(input("Enter the separator value for vote crawling: ")) | |
| crawler_instance.vote_crawler(start=start, end=end, separator=separator) | |
| crawler_instance.merge_out_txt() | |
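
# Example run (illustrative values): pressing Enter at every path prompt keeps
# the defaults, and crawling ids 1..1000 with separator 200 writes chunk files
# Resource/vote*.txt, then merges them into Resource/merged_vote.txt.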