import shutil
import zipfile
import json

import bm25s
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

indexer_id = "3gpp_bm25_docs"
unique_specs = set()

with open("indexed_specifications.json", "r") as f:
    spec_data = json.load(f)["specs"]
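# Expected shape of indexed_specifications.json, inferred from the fields read
# below (this schema is an assumption, not taken from the file itself):
# {"specs": {"<key>": {"id": ..., "title": ..., "version": ..., "release": ...,
#                      "type": ..., "working_group": ..., "url": ..., "scope": ...}}}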
# Merge every JSON file in the archive into a single mapping; the original
# assignment overwrote doc_data on each iteration, keeping only the last file.
doc_data = {}
with zipfile.ZipFile("indexed_docs_content.zip") as zf:
    for file_name in zf.namelist():
        if file_name.endswith(".json"):
            doc_bytes = zf.read(file_name)
            try:
                doc_data.update(json.loads(doc_bytes.decode("utf-8")))
            except json.JSONDecodeError as e:
                print(f"Error while decoding the JSON file {file_name}: {e}")
print("Documents loaded successfully!")
corpus_json = []
for specification in spec_data.values():
    if specification['id'] in unique_specs:
        continue
    document = doc_data.get(specification['id'])
    if document is None:
        continue
    if isinstance(document, str):
        # Plain-string entries carry no per-section structure; skip them.
        print(f"Skipping {specification['id']}")
        unique_specs.add(specification['id'])
        continue
    full_text = f"{specification['id']} - {specification['title']}\n\n\n"
    full_text += "\n".join(f"{title}\n\n{content}" for title, content in document.items())
    # WordNetLemmatizer operates on single words, so lemmatize token by token;
    # calling it on the whole string at once is effectively a no-op.
    lemmatized_text = " ".join(lemmatizer.lemmatize(token) for token in full_text.split())
    corpus_json.append({
        "text": lemmatized_text,
        "metadata": {
            "id": specification['id'],
            "title": specification['title'],
            "version": specification['version'],
            "release": specification['release'],
            "type": specification['type'],
            "working_group": specification['working_group'],
            "url": specification['url'],
            "scope": specification['scope'],
        },
    })
    unique_specs.add(specification['id'])
# Tokenize the lemmatized corpus with English stopwords removed, build the
# BM25 index, and package the saved index directory as a zip archive.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

retriever = bm25s.BM25(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save(indexer_id)
shutil.make_archive("bm25s", "zip", ".", indexer_id)
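# --- Usage sketch (not part of the original script) ---
# A minimal example of reloading and querying the saved index, assuming the
# standard bm25s load/retrieve API; the query string here is hypothetical.
# The query is lemmatized the same way as the corpus so tokens match.
#
# retriever = bm25s.BM25.load(indexer_id, load_corpus=True)
# query = " ".join(lemmatizer.lemmatize(tok) for tok in "handover procedure".split())
# query_tokens = bm25s.tokenize(query, stopwords="en")
# results, scores = retriever.retrieve(query_tokens, k=5)
# for doc, score in zip(results[0], scores[0]):
#     print(doc["metadata"]["id"], doc["metadata"]["title"], score)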