Spaces:
Running
Running
| from langchain.docstore.document import Document | |
| import re | |
def under_non_alpha_ratio(text: str, threshold: float = 0.5) -> bool:
    """Report whether alphabetic characters make up less than ``threshold`` of the text.

    This helps prevent text like "-----------BREAK---------" from being tagged as a
    title or narrative text. Whitespace characters are ignored: they count neither as
    alphabetic characters nor towards the total.

    Parameters
    ----------
    text
        The input string to test
    threshold
        The minimum proportion of alphabetic characters (among non-whitespace
        characters) the text is expected to have.

    Returns
    -------
    True if the proportion of alphabetic characters is strictly below ``threshold``
    (i.e. the text is mostly non-alphabetic). False otherwise, including when the text
    is empty or contains only whitespace.
    """
    # Keep only non-whitespace characters; ``char.strip()`` is empty for whitespace.
    visible_chars = [char for char in text if char.strip()]

    # Empty or whitespace-only text: there is no ratio to compute. Checking this
    # explicitly replaces the former catch-all around the division by zero.
    if not visible_chars:
        return False

    alpha_count = sum(1 for char in visible_chars if char.isalpha())
    return alpha_count / len(visible_chars) < threshold
def is_possible_title(
    text: str,
    title_max_word_length: int = 20,
    non_alpha_threshold: float = 0.5,
) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.

    Parameters
    ----------
    text
        The input text to check
    title_max_word_length
        The maximum length a title can have. Despite the name, this is compared
        against the number of characters in ``text`` (``len(text)``), not a word count.
    non_alpha_threshold
        Forwarded to ``under_non_alpha_ratio`` as its ``threshold``; the text is
        rejected when that function returns True.

    Returns
    -------
    True only when the text is non-empty, does not end in punctuation, is not longer
    than ``title_max_word_length`` characters, is not flagged by
    ``under_non_alpha_ratio``, is not entirely numeric, and has at least one numeric
    character among its first five characters. False otherwise.
    """
    # Empty text can never be a title.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    # Reject text whose final character is neither a word character nor whitespace,
    # i.e. text ending in punctuation. This also covers salutations such as
    # "To My Dearest Friends," and endings in ".", "," or "。", so no separate
    # endswith() check is needed.
    if re.search(r"[^\w\s]\Z", text) is not None:
        return False

    # Length limit, measured in characters (default 20).
    if len(text) > title_max_word_length:
        return False

    # Reject text in which alphabetic characters are too small a share.
    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
        return False

    if text.isnumeric():
        print(f"Not a title. Text is all numeric:\n\n{text}")
        return False

    # A title must contain a numeric character within its first five characters.
    # Slicing handles texts shorter than five characters on its own.
    return any(char.isnumeric() for char in text[:5])
def zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Mark title-like documents and prefix later documents with the current title.

    Each document whose ``page_content`` passes ``is_possible_title`` gets
    ``metadata['category']`` set to ``'cn_Title'`` and becomes the current title.
    Every following non-title document has a sentence naming that title prepended to
    its ``page_content``. Documents that come before the first title are left as is.

    Parameters
    ----------
    docs
        The documents to process. They are modified in place.

    Returns
    -------
    The same ``docs`` object that was passed in. For empty input a message is printed
    and the input is returned unchanged, so the return value is consistent on every
    path instead of being an implicit ``None``.
    """
    # Nothing to process: report it and hand the input back.
    if not docs:
        print("文件不存在")
        return docs

    title = None
    for doc in docs:
        if is_possible_title(doc.page_content):
            doc.metadata['category'] = 'cn_Title'
            title = doc.page_content
        elif title:
            # Only documents that follow a detected title are prefixed.
            doc.page_content = f"下文与({title})有关。{doc.page_content}"
    return docs