Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| class LawTxetPreProcessor(): | |
| def __init__(self, law_texts: list) -> None: | |
| self._law_texets = law_texts | |
| self._law_name_df = pd.DataFrame(columns=["law_index", "law_name"]) | |
| self._madeh_df = pd.DataFrame(columns=["law_index", "madeh_index", "madeh_text"]) | |
| self._is_df = False | |
| def build_df(self): | |
| title_list = [] | |
| madeh_list = [] | |
| madeh_index = [] | |
| law_index = [] | |
| counter = 0 | |
| for text in self._law_texets: | |
| title = self.title_extractor(text) | |
| title_list.append(title) | |
| temp_madeh_list = self.madeh_extractor(text, title == "قانون اساسی جمهوری اسلامی ایران") | |
| law_index.extend([counter for i in temp_madeh_list]) | |
| madeh_index.extend([i+1 for i in range(len(temp_madeh_list))]) | |
| madeh_list.extend(temp_madeh_list) | |
| counter += 1 | |
| law_index_list = [i for i in range(counter)] | |
| self._madeh_df = pd.DataFrame({"law_index": law_index, | |
| "madeh_index": madeh_index, | |
| "madeh_text": madeh_list}) | |
| self._law_name_df = pd.DataFrame({"law_index": law_index_list, | |
| "law_name": title_list}) | |
| def title_extractor(self, law_text: str) -> str: | |
| first_newline_index = law_text.find('\n') | |
| return law_text[:first_newline_index] | |
| def madeh_extractor(self, law_text: str, is_asl:False)-> list: | |
| result = [] | |
| pattern = r"(^.{0,1}اصل )" if is_asl else r"(^.{0,1}ماده)" | |
| removed_regex = r"❯.*\n" | |
| notvalid_pattern = r"(^.{0,1}ماده.*مکرر\n)" | |
| cleaned_text = re.sub(removed_regex, "", law_text) | |
| matches = re.finditer(pattern, cleaned_text, flags=re.MULTILINE) | |
| not_valid_matches = re.finditer(notvalid_pattern, cleaned_text, flags=re.MULTILINE) | |
| indices = [match.start() for match in matches] | |
| not_valid_indices = [match.start() for match in not_valid_matches] | |
| valid_indices = [item for item in indices if item not in not_valid_indices] | |
| for i in range(len(valid_indices)): | |
| start = valid_indices[i] | |
| if i != len(valid_indices)-1: | |
| end = valid_indices[i+1] | |
| result.append(cleaned_text[start:end]) | |
| else: | |
| result.append(cleaned_text[start:]) | |
| return result | |
| def get_df(self) -> pd.DataFrame: | |
| if not self._is_df: | |
| self.build_df() | |
| return self._law_name_df, self._madeh_df |