import bz2
import os
import pickle
import xml.etree.ElementTree as ET

import mwparserfromhell
import requests
from tqdm import tqdm

# Step 1: Download the latest Telugu Wiktionary dump
DUMP_URL = "https://dumps.wikimedia.org/tewiktionary/latest/tewiktionary-latest-pages-articles.xml.bz2"
dump_file = "tewiktionary-latest-pages-articles.xml.bz2"

print("Downloading the latest dump...")
response = requests.get(DUMP_URL, stream=True)
total_size = int(response.headers.get('content-length', 0))
progress_bar = tqdm(total=total_size, unit='B', unit_scale=True)
with open(dump_file, 'wb') as file:
    for chunk in response.iter_content(chunk_size=8192):
        progress_bar.update(len(chunk))
        file.write(chunk)
progress_bar.close()

# Step 2: Decompress the bz2 archive to a plain XML file
print("\nExtracting the dump...")
xml_file = dump_file[:-4]  # strip the ".bz2" suffix
with bz2.open(dump_file, 'rb') as source, open(xml_file, 'wb') as dest:
    for line in source:
        dest.write(line)

# Step 3: Parse the XML dump and extract English interlanguage links
print("Parsing the XML dump to extract translations...")
tree = ET.parse(xml_file)
root = tree.getroot()
# The schema version in the namespace must match the dump's <mediawiki> root tag
# (e.g. export-0.10 or export-0.11); adjust it if findall() returns no pages.
ns = {'ns': 'http://www.mediawiki.org/xml/export-0.10/'}

translations = {}
for page in root.findall('ns:page', ns):
    title = page.find('ns:title', ns).text
    revision = page.find('ns:revision', ns)
    if revision is not None:  # explicit None check: empty elements are falsy
        text_data = revision.find('ns:text', ns)
        if text_data is not None and text_data.text:
            # Parse the wikitext and keep wikilinks of the form [[en:...]]
            wikicode = mwparserfromhell.parse(text_data.text)
            links = [str(link.title) for link in wikicode.filter_wikilinks()
                     if str(link.title).startswith("en:")]
            if links:
                english_translations = [link.split(':', 1)[1] for link in links]
                translations[title] = english_translations

# Display the first 1000 translations
print("\nDisplaying the first 1000 translations:")
for i, (telugu_word, english_words) in enumerate(translations.items()):
    if i >= 1000:
        break
    print(f"Telugu Word: {telugu_word}, English Translations: {', '.join(english_words)}")

# Save the translations to a pickle file
print("\nSaving translations to pickle file...")
pickle_filename = "telugu_english_translations.pkl"
with open(pickle_filename, 'wb') as file:
    pickle.dump(translations, file)
print(f"Translations saved to {pickle_filename}")

# Optional: remove the downloaded dump files when no longer needed
# os.remove(dump_file)
# os.remove(xml_file)
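
# Sanity check (a minimal sketch): reload the pickle that was just written and
# look up a single entry. The lookup word below is only an illustrative,
# hypothetical key and may not exist in the extracted dictionary.
with open(pickle_filename, 'rb') as file:
    reloaded = pickle.load(file)
sample_word = "నీరు"  # illustrative key; replace with any extracted page title
print(f"{sample_word}: {reloaded.get(sample_word, [])}")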
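
# Optional alternative (a sketch, not used above): for very large dumps,
# xml.etree.ElementTree.iterparse can stream <page> elements one at a time
# instead of building the whole tree in memory with ET.parse. This helper
# assumes the same export-0.10 namespace as Step 3.
def iter_pages(xml_path, namespace='{http://www.mediawiki.org/xml/export-0.10/}'):
    """Yield (title, wikitext) pairs without keeping the full tree in memory."""
    for _event, elem in ET.iterparse(xml_path, events=('end',)):
        if elem.tag == namespace + 'page':
            page_title = elem.findtext(namespace + 'title')
            page_text = elem.findtext(f'{namespace}revision/{namespace}text') or ''
            yield page_title, page_text
            elem.clear()  # release the processed <page> subtree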