import os
import warnings

import requests
from dotenv import load_dotenv
from huggingface_hub import configure_http_backend

# Disable TLS certificate verification for Hugging Face Hub traffic. This is
# typically only needed behind a proxy that re-signs HTTPS; drop it otherwise.
os.environ["CURL_CA_BUNDLE"] = ""


def backend_factory() -> requests.Session:
    session = requests.Session()
    session.verify = False
    return session


configure_http_backend(backend_factory=backend_factory)
warnings.filterwarnings("ignore")
load_dotenv()  # pick up HF_TOKEN from a local .env file
import bm25s
from bm25s.hf import BM25HF
from datasets import load_dataset

# Load the 3GPP specification text and metadata from the Hub.
dataset_text = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
dataset_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata")["train"].to_list()

# Group sections by document id once, so each specification's sections can be
# looked up directly instead of rescanning the whole text dataset per spec.
sections_by_doc = {}
for section in dataset_text:
    sections_by_doc.setdefault(section["doc_id"], []).append(section)
def get_document(spec_id: str, spec_title: str) -> list:
    """Return the title line followed by every section of one specification."""
    text = [f"{spec_id} - {spec_title}\n"]
    for section in sections_by_doc.get(spec_id, []):
        text.append(f"{section['section']}\n\n{section['content']}")
    return text
# First index: one BM25 document per specification *section*.
unique_specs = set()
corpus_json = []
for specification in dataset_metadata:
    if specification["id"] in unique_specs:
        continue
    for section in sections_by_doc.get(specification["id"], []):
        corpus_json.append({
            "text": f"{section['section']}\n{section['content']}",
            "metadata": {
                "id": specification["id"],
                "title": specification["title"],
                "section_title": section["section"],
                "version": specification["version"],
                "type": specification["type"],
                "working_group": specification["working_group"],
                "url": specification["url"],
                "scope": specification["scope"],
            },
        })
    unique_specs.add(specification["id"])
# Tokenize, index, and push the section-level index to the Hub.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=os.environ.get("HF_TOKEN"))
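
# Quick sanity check before building the second index. A minimal sketch: the
# query string is purely illustrative, not taken from the datasets. bm25s
# returns the corpus dicts directly because BM25HF was constructed with
# corpus=corpus_json above.
query_tokens = bm25s.tokenize("NR handover preparation procedure", stopwords="en")
results, scores = retriever.retrieve(query_tokens, k=5)
for doc, score in zip(results[0], scores[0]):
    print(doc["metadata"]["id"], doc["metadata"]["section_title"], float(score))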
# Second index: one BM25 document per *whole* specification.
unique_specs = set()
corpus_json = []
for specification in dataset_metadata:
    if specification["id"] in unique_specs:
        continue
    text_list = get_document(specification["id"], specification["title"])
    if len(text_list) == 1:
        continue  # no section text found for this specification
    corpus_json.append({"text": "\n".join(text_list), "metadata": specification})
    unique_specs.add(specification["id"])
# Tokenize, index, and push the document-level index to the Hub.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
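
# Round-trip check (a sketch, with another illustrative query): reload the
# freshly pushed document-level index from the Hub; load_corpus=True also
# restores the metadata-bearing corpus entries saved with the index.
reloaded = BM25HF.load_from_hub("OrganizedProgrammers/3GPPBM25IndexSingle", load_corpus=True)
query_tokens = bm25s.tokenize("security architecture for the 5G system", stopwords="en")
results, scores = reloaded.retrieve(query_tokens, k=3)
for doc, score in zip(results[0], scores[0]):
    print(doc["metadata"]["id"], doc["metadata"]["title"], float(score))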