Spaces:
Runtime error
Runtime error
| import re | |
| import unicodedata | |
| import requests | |
| from bs4 import BeautifulSoup | |
def retrieve_parsed_doc(patent_information, summaries_generated, timeout=30):
    """Fetch a Google Patents page and extract the requested sections.

    Args:
        patent_information: Either a bare patent code, or a string containing
            "https" whose fifth "/"-separated component is the patent code
            (e.g. ``https://patents.google.com/patent/<code>/en``).
        summaries_generated: Container of section names. Membership of
            "Abstract", "Background" and "Claims" decides which sections
            are extracted.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        ``[abstract, background, claim_list]``. ``abstract`` and
        ``background`` are cleaned strings and ``claim_list`` is a list of
        distinct, non-empty cleaned claim strings sorted longest first; any
        section that was not requested is ``None``. If anything fails
        (network error, timeout, missing element, ...) the error is printed
        and ``None`` is returned instead of the list.
    """
    try:
        language_config = "en"
        if "https" in patent_information:
            patent_code = patent_information.split("/")[4]
        else:
            patent_code = patent_information
        URL = f"https://patents.google.com/patent/{patent_code}/{language_config}"
        page = requests.get(URL, timeout=timeout)
        soup = BeautifulSoup(page.content, 'lxml')

        abstract = None
        if "Abstract" in summaries_generated:
            # NOTE(review): a dict is passed as the tag-name filter here; it is
            # unclear whether BeautifulSoup honours the nested class
            # constraint or only the "div" key -- confirm against bs4 docs.
            abstract = clean_text(soup.find({"div":{"class":"abstract"}}).prettify())

        background = None
        if "Background" in summaries_generated:
            # The last matching description element is the one used.
            descriptions = soup.find_all(itemprop="description", itemscope="")
            background = clean_text(descriptions[-1].prettify())

        claim_list = None
        if "Claims" in summaries_generated:
            claims = soup.find(itemprop="claims")
            # NOTE(review): same dict-style filter as for the abstract -- confirm.
            main_claim = claims.find_all({"div":{"class":"claim"}})
            main_claims = main_claim[0].select("div[class=claim]")
            # A set removes claims whose cleaned text is identical.
            formatted_claims = {clean_text(claim.prettify()) for claim in main_claims}
            # Drop claims that cleaned down to nothing; no error if absent.
            formatted_claims.discard('')
            claim_list = sorted(formatted_claims, key=len, reverse=True)

        return [abstract, background, claim_list]
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None on any failure.
        print(f'[ERROR] {e}')
        return None
def get_word_index(s, limit, chr_limit=3500):
    """Return the character offset in *s* where word number *limit* starts.

    Words are maximal runs of non-whitespace characters, counted from zero.

    Args:
        s: Text to scan.
        limit: Zero-based index of the word whose start offset is wanted.
        chr_limit: Upper bound for the fallback offset (default 3500).

    Returns:
        The offset of the first character of word *limit*. If *s* does not
        contain that many words (or *limit* cannot be used as an index),
        ``min(len(s), chr_limit)`` is returned instead.
    """
    try:
        # Each match carries its surrounding whitespace, so the chunk lengths
        # add up to exact character offsets in ``s``.
        words = re.findall(r'\s*\S+\s*', s)
        target = words[limit]
        # Only the first chunk can start with whitespace; skip over it so the
        # offset points at the word itself.
        leading_ws = len(target) - len(target.lstrip())
        return sum(map(len, words[:limit])) + leading_ws
    except (IndexError, TypeError):
        # Too few words or an unusable index: fall back to a character cap.
        return min(len(s), chr_limit)
def post_process(s):
    """Tidy generated text and cut it back to its last full stop.

    Removes a single leading space, rewrites "- " as "-" and " ." as ".",
    then discards everything after the last "." and terminates the result
    with ".". Text containing no "." at all (including the empty string)
    therefore yields just ".".

    Args:
        s: Text to post-process.

    Returns:
        The tidied text, always ending in ".".
    """
    # Basic post-processing
    # startswith() is safe on an empty string, unlike indexing s[0].
    if s.startswith(" "):
        s = s[1:]
    s = s.replace("- ", "-").replace(" .", ".")
    # Keep everything before the last "." (empty when there is none).
    head, _, _ = s.rpartition(".")
    return head + "."
def clean_text(text):
    """Strip markup and normalise punctuation/spacing in scraped text.

    Steps, in order: remove ``<...>`` tags; remove parenthesised spans;
    replace every word character followed by one or more uppercase letters
    with "."; trim surrounding whitespace; drop characters whose Unicode
    category starts with "C"; collapse runs of spaces; apply a fixed set of
    punctuation replacements; strip leading digits, dots, dashes and spaces;
    collapse consecutive repeated words.

    Args:
        text: Raw text, possibly containing HTML tags.

    Returns:
        The cleaned string.
    """
    # TODO: optimize text cleaning
    stripped = re.sub(r'<.*?>', '', text)
    stripped = re.sub(r'\([^)]*\)', '', stripped)
    stripped = re.sub(r"(\w)([A-Z]+)", r'.', stripped)
    stripped = stripped.strip()

    # Drop every character in a "C*" Unicode category (control, format, ...).
    visible = "".join(
        filter(lambda ch: not unicodedata.category(ch).startswith("C"), stripped)
    )
    result = re.sub(' +', ' ', visible)

    # Literal replacements; the order is significant.
    substitutions = (
        (";", ", and"),
        (":", ""),
        (" .", "."),
        (" ,", ","),
        ("\xa0", " "),
    )
    for old, new in substitutions:
        result = result.replace(old, new)

    result = result.lstrip('0123456789.- ')  # remove nums at start
    # remove repeated consecutive words
    return re.sub(r'\b(\w+)( \1\b)+', r'\1', result)