Spaces:
Running
Running
add app
Browse files- README.md +4 -4
- app.py +293 -0
- default_wiki_pipeline.py +1273 -0
- requirements.txt +4 -0
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
|
|
|
| 1 |
---
|
| 2 |
+
title: FineWiki Viewer
|
| 3 |
+
emoji: 🌐
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
app.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
# _LOCAL_TMP = "/fsx/guilherme/tmp"
|
| 3 |
+
# try:
|
| 4 |
+
# os.makedirs(_LOCAL_TMP, exist_ok=True)
|
| 5 |
+
# os.environ.setdefault("TMPDIR", _LOCAL_TMP)
|
| 6 |
+
# os.environ.setdefault("TEMP", _LOCAL_TMP)
|
| 7 |
+
# os.environ.setdefault("TMP", _LOCAL_TMP)
|
| 8 |
+
# _GRADIO_TMP = os.path.join(_LOCAL_TMP, "gradio")
|
| 9 |
+
# os.makedirs(_GRADIO_TMP, exist_ok=True)
|
| 10 |
+
# os.environ.setdefault("GRADIO_TEMP_DIR", _GRADIO_TMP)
|
| 11 |
+
# except Exception:
|
| 12 |
+
# pass
|
| 13 |
+
|
| 14 |
+
import gradio as gr
|
| 15 |
+
from datatrove.pipeline.readers import ParquetReader
|
| 16 |
+
from default_wiki_pipeline import _parse_and_clean_wikicode, mwparserfromhell
|
| 17 |
+
|
| 18 |
+
# Wiki language codes offered in the "Language" dropdown. on_select_language
# appends "wiki" to the selected code to build the dataset path it reads from.
lang_list = ['ab', 'ace', 'ady', 'af', 'als', 'alt', 'ami', 'am', 'ang', 'anp', 'an', 'arc', 'ar', 'ary', 'arz', 'ast', 'as', 'atj', 'avk', 'av', 'awa', 'ay', 'azb', 'az', 'ban', 'bar', 'bat_smg', 'ba', 'bbc', 'bcl', 'be', 'bg', 'bh', 'bi', 'bjn', 'blk', 'bm', 'bn', 'bo', 'bpy', 'br', 'bs', 'bug', 'bxr', 'ca', 'cbk_zam', 'cdo', 'ceb', 'ce', 'chr', 'ch', 'chy', 'ckb', 'co', 'crh', 'cr', 'csb', 'cs', 'cu', 'cv', 'cy', 'dag', 'da', 'de', 'dga', 'din', 'diq', 'dsb', 'dty', 'dv', 'dz', 'ee', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fat', 'fa', 'ff', 'fiu_vro', 'fi', 'fj', 'fon', 'fo', 'frp', 'frr', 'fr', 'fur', 'fy', 'gag', 'gan', 'ga', 'gcr', 'gd', 'glk', 'gl', 'gn', 'gom', 'gor', 'got', 'gpe', 'guc', 'gur', 'gu', 'guw', 'gv', 'hak', 'ha', 'haw', 'he', 'hif', 'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'ie', 'ig', 'ik', 'ilo', 'inh', 'io', 'is', 'it', 'iu', 'jam', 'ja', 'jbo', 'jv', 'kaa', 'kab', 'ka', 'kbd', 'kbp', 'kcg', 'kg', 'ki', 'kk', 'kl', 'km', 'kn', 'koi', 'ko', 'krc', 'ksh', 'ks', 'ku', 'kv', 'kw', 'ky', 'lad', 'la', 'lbe', 'lb', 'lez', 'lfn', 'lg', 'lij', 'li', 'lld', 'lmo', 'ln', 'lo', 'ltg', 'lt', 'lv', 'mad', 'mai', 'map_bms', 'mdf', 'mg', 'mhr', 'min', 'mi', 'mk', 'ml', 'mni', 'mn', 'mnw', 'mrj', 'mr', 'ms', 'mt', 'mwl', 'myv', 'my', 'mzn', 'nah', 'nap', 'nds_nl', 'nds', 'ne', 'new', 'nia', 'nl', 'nn', 'nov', 'no', 'nqo', 'nrm', 'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pag', 'pam', 'pap', 'pa', 'pcd', 'pcm', 'pdc', 'pfl', 'pih', 'pi', 'pl', 'pms', 'pnb', 'pnt', 'ps', 'pt', 'pwn', 'qu', 'rm', 'rmy', 'rn', 'roa_rup', 'roa_tara', 'ro', 'rue', 'ru', 'rw', 'sah', 'sat', 'sa', 'scn', 'sco', 'sc', 'sd', 'se', 'sg', 'shi', 'shn', 'sh', 'simple', 'si', 'skr', 'sk', 'sl', 'smn', 'sm', 'sn', 'so', 'sq', 'srn', 'sr', 'ss', 'stq', 'st', 'su', 'sv', 'sw', 'szl', 'szy', 'ta', 'tay', 'tcy', 'tet', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tly', 'tn', 'to', 'tpi', 'trv', 'tr', 'ts', 'tt', 'tum', 'tw', 'tyv', 'ty', 'udm', 'ug', 'uk', 'ur', 'uz', 
'vec', 'vep', 've', 'vi', 'vls', 'vo', 'war', 'wa', 'wo', 'wuu', 'xal', 'xh', 'xmf', 'yi', 'yo', 'za', 'zea', 'zgh', 'zh_classical', 'zh_min_nan', 'zh_yue', 'zh', 'zu']
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _build_header_markdown(doc) -> str:
    """Build the Markdown header shown for a document.

    The title, page ID (``page_id``, falling back to ``id``) and Wikidata ID
    found in ``doc.metadata`` are rendered as bold-labelled fields joined by
    " | "; empty fields are left out. When the metadata has a URL, it is
    appended on a new line as a Markdown link whose text is the URL itself.
    """
    metadata = doc.metadata or {}
    labelled_fields = (
        ("Title", metadata.get("title") or ""),
        ("Page ID", metadata.get("page_id") or metadata.get("id") or ""),
        ("Wikidata ID", metadata.get("wikidata_id") or ""),
    )
    header = " | ".join(
        f"**{label}**: {value}" for label, value in labelled_fields if value
    )
    link = metadata.get("url") or ""
    if not link:
        return header
    return f"{header}\n[{link}]({link})"
|
| 38 |
+
def matches_filters(doc, require_has_math: bool | None, require_has_infobox: bool | None) -> bool:
|
| 39 |
+
meta = doc.metadata or {}
|
| 40 |
+
if require_has_math and not bool(meta.get("has_math")):
|
| 41 |
+
return False
|
| 42 |
+
if require_has_infobox and not meta.get("infoboxes"):
|
| 43 |
+
return False
|
| 44 |
+
return True
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def find_next_matching_from(docs_cache, reader_iter, start_idx: int, require_has_math: bool | None, require_has_infobox: bool | None):
    """Locate the first document after ``start_idx`` that passes the filters.

    Already-cached documents are checked first; after that, documents are
    pulled from ``reader_iter`` one at a time (and appended to the cache)
    until one passes or the reader yields nothing more.

    Returns ``(index, docs_cache, reader_iter)``; ``index`` is ``-1`` when no
    matching document exists.
    """
    # Positions below -1 are treated as "before the first document".
    first_candidate = max(-1, start_idx) + 1
    for position in range(first_candidate, len(docs_cache)):
        if matches_filters(docs_cache[position], require_has_math, require_has_infobox):
            return position, docs_cache, reader_iter

    # Nothing suitable is cached: extend the cache by one document per round.
    while True:
        size_before = len(docs_cache)
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, size_before)
        if len(docs_cache) == size_before:
            # The cache did not grow, so there is nothing left to read.
            return -1, docs_cache, reader_iter
        if matches_filters(docs_cache[-1], require_has_math, require_has_infobox):
            return len(docs_cache) - 1, docs_cache, reader_iter
|
| 63 |
+
|
| 64 |
+
def render_iframe(url: str, height: int = 800) -> str:
    """Return the HTML for an ``<iframe>`` that previews ``url``.

    Args:
        url: Address to embed; an empty or ``None`` value embeds ``about:blank``.
        height: Height of the frame in pixels.

    Returns:
        The iframe markup as a string.
    """
    import html  # local import so this function stays self-contained

    # The URL comes from dataset metadata: escape it so a quote or angle
    # bracket cannot break out of the src attribute and inject markup.
    src = html.escape(url or "about:blank", quote=True)
    return (
        f'<iframe src="{src}" '
        f'style="width:100%; height:{height}px; border:0;" loading="lazy"></iframe>'
    )
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _safe_url_from_metadata(meta: dict) -> str:
    """Return the ``url`` entry of ``meta``, or "" when ``meta`` or the URL is missing or empty."""
    if not meta:
        return ""
    return meta.get("url") or ""
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _extract_language(meta: dict) -> str:
    """Work out a language code from document metadata.

    An explicit ``lang`` or ``language`` entry wins. Otherwise the code is
    derived from the ``wiki`` / ``wikiname`` entry by dropping a trailing
    ``"_namespace_0"`` and then a trailing ``"wiki"`` (for example
    ``"enwiki_namespace_0"`` gives ``"en"``).

    Returns:
        The language code, or ``"en"`` when nothing usable is found. That
        includes a wiki name that is nothing but the ``"wiki"`` suffix, which
        would otherwise produce an empty code.
    """
    meta = meta or {}
    lang = meta.get("lang") or meta.get("language")
    if lang:
        return str(lang)
    wiki = meta.get("wiki") or meta.get("wikiname") or ""
    code = str(wiki).removesuffix("_namespace_0").removesuffix("wiki")
    return code or "en"
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _ensure_until_index(docs_cache, reader_iter, target_idx: int):
    """Grow ``docs_cache`` in place until it holds an item at ``target_idx``.

    Items are taken from ``reader_iter`` one by one and appended to the cache;
    reading stops as soon as the cache is long enough or the iterator is
    exhausted. With ``reader_iter`` set to ``None`` nothing is read.

    Returns the ``(docs_cache, reader_iter)`` pair that was passed in.
    """
    if reader_iter is None:
        return docs_cache, reader_iter
    end_of_stream = object()  # unique marker returned by next() when exhausted
    while len(docs_cache) <= target_idx:
        item = next(reader_iter, end_of_stream)
        if item is end_of_stream:
            break
        docs_cache.append(item)
    return docs_cache, reader_iter
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def on_select_language(lang: str, require_has_math: bool, require_has_infobox: bool):
    """Load documents for the selected language from HF Parquet and display.

    Args:
        lang: Language code chosen in the dropdown; "wiki" is appended to it
            to form the dataset sub-directory.
        require_has_math: Only show documents whose metadata flags math.
        require_has_infobox: Only show documents that have infoboxes.

    Returns:
        A 9-tuple, one value per wired output component, in this order:
        index, document cache, reader iterator, extracted text, metadata,
        header Markdown, cleaned wikitext, infoboxes, iframe HTML. On failure
        the index is ``-1`` and the message is placed in the text slot.
    """

    def _message(docs, reader, text):
        # Every return must carry nine values; the empty string fills the
        # cleaned-wikitext slot so the list and iframe land in their own slots.
        return (-1, docs, reader, text, {}, "", "", [], render_iframe(""))

    language = (lang or "").strip()
    if not language:
        return _message([], None, "Select a language.")

    path = f"hf://datasets/HuggingFaceFW/finewiki/data/{language}wiki"
    try:
        reader_iter = ParquetReader(path)()
        # NOTE(review): the reader call presumably only builds a lazy
        # iterator, so the first document is fetched inside the try block to
        # make read failures surface here as a message instead of a crash.
        docs_cache, reader_iter = _ensure_until_index([], reader_iter, 0)
    except Exception as e:
        return _message([], None, f"Failed to read: {e}")

    if not docs_cache:
        return _message([], reader_iter, "No documents found.")

    # Find first doc matching filters (starting before 0)
    idx, docs_cache, reader_iter = find_next_matching_from(
        docs_cache, reader_iter, -1, require_has_math, require_has_infobox
    )
    if idx == -1:
        return _message(docs_cache, reader_iter, "No documents match filters.")

    left, left_meta, md, info, right, header = render_idx(docs_cache, idx)
    return (idx, docs_cache, reader_iter, left, left_meta, header, md, info, right)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def on_find(docs_cache, idx: int, reader_iter, id_query: str, require_has_math: bool, require_has_infobox: bool):
    """Jump to the document identified by ``id_query`` and render it.

    Lookup order:
      1. exact match in the cache (document id, Wikidata ID or URL);
      2. suffix match in the cache (id ending in "/<query>" or URL ending in
         the query);
      3. streaming new documents from ``reader_iter`` until one matches
         either way.
    From the located document, the first one at or after it that passes the
    filters is shown. An empty query, or a query that is never found, shows
    the first document that passes the filters.

    Args:
        docs_cache: Documents read so far (extended while streaming).
        idx: Current index; not used by the lookup.
        reader_iter: Iterator producing further documents, or ``None``.
        id_query: Text typed by the user.
        require_has_math: Only show documents whose metadata flags math.
        require_has_infobox: Only show documents that have infoboxes.

    Returns:
        A 9-tuple, one value per wired output component: index, document
        cache, reader iterator, extracted text, metadata, header Markdown,
        cleaned wikitext, infoboxes, iframe HTML. On failure the index is
        ``-1`` and the message is placed in the text slot.
    """
    query = (id_query or "").strip()

    def _message(docs, reader, text):
        # Nine values are required; the empty string fills the
        # cleaned-wikitext slot so later values reach the right components.
        return -1, docs, reader, text, {}, "", "", [], render_iframe("")

    def _show_first_match_from(first_idx, docs, reader):
        # Render the first filter-passing document at or after first_idx.
        # Rendering happens only once the final index is known.
        new_idx, docs, reader = find_next_matching_from(
            docs, reader, first_idx - 1, require_has_math, require_has_infobox
        )
        if new_idx == -1:
            return _message(docs, reader, "No documents match filters.")
        left, left_meta, md, info, right, header = render_idx(docs, new_idx)
        return new_idx, docs, reader, left, left_meta, header, md, info, right

    def _fields(doc):
        # The three values a query is compared against.
        meta = getattr(doc, "metadata", None) or {}
        doc_id = getattr(doc, "id", None) or ""
        return doc_id, meta.get("wikidata_id"), meta.get("url") or ""

    def _is_exact_match(doc):
        doc_id, wikidata_id, url = _fields(doc)
        return doc_id == query or wikidata_id == query or url == query

    def _is_suffix_match(doc):
        doc_id, _, url = _fields(doc)
        return doc_id.endswith(f"/{query}") or url.endswith(query)

    if not docs_cache and reader_iter is None:
        return _message(docs_cache, reader_iter, "No documents loaded.")

    if not query:
        return _show_first_match_from(0, docs_cache, reader_iter)

    # Cached documents: a full exact pass comes before any suffix match.
    for predicate in (_is_exact_match, _is_suffix_match):
        for i, doc in enumerate(docs_cache):
            if predicate(doc):
                return _show_first_match_from(i, docs_cache, reader_iter)

    # Stream forward until found or exhausted
    while True:
        prev_len = len(docs_cache)
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, prev_len)
        if len(docs_cache) == prev_len:
            break
        newest = docs_cache[-1]
        if _is_exact_match(newest) or _is_suffix_match(newest):
            return _show_first_match_from(len(docs_cache) - 1, docs_cache, reader_iter)

    # Query not found anywhere: fall back to the first filter-passing document.
    if not docs_cache:
        return _message(docs_cache, reader_iter, "No documents found.")
    return _show_first_match_from(0, docs_cache, reader_iter)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def show_doc(doc):
    """Assemble everything the UI displays for a single document.

    Returns a 6-tuple: the document's ``text`` attribute, its metadata dict,
    the ``wikitext`` metadata entry after cleaning with
    ``_parse_and_clean_wikicode``, the ``infoboxes`` metadata entry (``[]``
    when absent), iframe HTML for the page URL, and the Markdown header.
    """
    metadata = getattr(doc, "metadata", None) or {}
    # Clean the raw wikitext with the default_wiki_pipeline helper.
    cleaned_wikitext = _parse_and_clean_wikicode(
        metadata.get("wikitext"),
        parser=mwparserfromhell,
        language=_extract_language(metadata),
    )
    return (
        getattr(doc, "text", ""),
        metadata,
        cleaned_wikitext,
        metadata.get("infoboxes", []),
        render_iframe(_safe_url_from_metadata(metadata)),
        _build_header_markdown(doc),
    )
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def render_idx(docs, idx: int):
    """Render the document at ``idx``, clamping the index into range.

    Returns the 6-tuple produced by ``show_doc``: text, metadata, cleaned
    wikitext, infoboxes, iframe HTML, header. With no documents, a
    placeholder tuple carrying "No documents." in the text slot is returned.
    """
    if not docs:
        return "No documents.", {}, "", [], render_iframe(""), ""
    position = min(idx, len(docs) - 1)
    if position < 0:
        position = 0
    text, metadata, cleaned, infoboxes, iframe_html, header = show_doc(docs[position])
    return text, metadata, cleaned, infoboxes, iframe_html, header
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def on_prev(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool):
    """Step back to the nearest earlier document that passes the filters.

    Only cached documents are scanned. When no earlier document passes the
    filters, the view stays on the current document instead of moving to one
    that fails them.

    Returns:
        A 9-tuple, one value per wired output component: index, document
        cache, reader iterator, extracted text, metadata, header Markdown,
        cleaned wikitext, infoboxes, iframe HTML.
    """
    if not docs_cache:
        # Try to ensure at least first doc is loaded
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
        if not docs_cache:
            # Nine values are required; the empty string fills the
            # cleaned-wikitext slot.
            return idx, docs_cache, reader_iter, "No documents.", {}, "", "", [], render_iframe("")

    last = len(docs_cache) - 1
    # Clamp both positions so an out-of-range index cannot raise IndexError.
    current = min(max(0, idx), last)
    scan_start = min(max(0, idx - 1), last)
    filtered_idx = next(
        (
            i
            for i in range(scan_start, -1, -1)
            if matches_filters(docs_cache[i], require_has_math, require_has_infobox)
        ),
        current,
    )
    left, left_meta, md, info, right, header = render_idx(docs_cache, filtered_idx)
    return filtered_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def on_next(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool):
    """Advance to the next document after ``idx`` that passes the filters.

    Cached documents are checked first, then further documents are read from
    ``reader_iter`` as needed. When nothing further matches, the index is
    left unchanged and a message is shown in the text slot.

    Returns:
        A 9-tuple, one value per wired output component: index, document
        cache, reader iterator, extracted text, metadata, header Markdown,
        cleaned wikitext, infoboxes, iframe HTML.
    """
    target_idx = idx + 1 if idx >= 0 else 0
    docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, target_idx)
    if not docs_cache:
        # Nine values are required; the empty string fills the
        # cleaned-wikitext slot.
        return idx, docs_cache, reader_iter, "No documents.", {}, "", "", [], render_iframe("")

    # Apply filters forward
    new_idx, docs_cache, reader_iter = find_next_matching_from(
        docs_cache, reader_iter, idx, require_has_math, require_has_infobox
    )
    if new_idx == -1:
        return idx, docs_cache, reader_iter, "No documents match filters.", {}, "", "", [], render_iframe("")

    left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
    return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
# Gradio UI: per-session state, the control rows, the two content columns and
# the event wiring.
# NOTE(review): container nesting below was reconstructed from a listing that
# had lost its indentation - confirm against the intended layout.
with gr.Blocks() as demo:
    idx_state = gr.State(value=-1)      # index of the displayed document (-1: none)
    docs_state = gr.State(value=[])     # documents read so far
    iter_state = gr.State(value=None)   # iterator producing further documents

    with gr.Row():
        # Full-width controls row for navigation
        with gr.Column():
            with gr.Row():
                language_select = gr.Dropdown(choices=lang_list, value="en", label="Language")
            with gr.Row():
                prev_btn = gr.Button("Previous")
                next_btn = gr.Button("Next")
            header_md = gr.Markdown()
        with gr.Column():
            with gr.Row():
                require_has_math = gr.Checkbox(label="Has math", value=False)
                require_has_infobox = gr.Checkbox(label="Has infobox", value=False)
            with gr.Row():
                id_input = gr.Textbox(label="Wikidata ID/URL/Page ID", placeholder="e.g., Q42 or https://... or 12345", lines=1)
                find_btn = gr.Button("Find")
            with gr.Row():
                show_wiki = gr.Checkbox(label="Show wikimedia/wikipedia extraction", value=False)
                show_preview = gr.Checkbox(label="Show preview", value=True)
                show_infoboxes = gr.Checkbox(label="Show infoboxes", value=True)
    with gr.Row():
        with gr.Column():
            left_text = gr.Textbox(label="FineWiki extractions", lines=30)
            left_meta = gr.JSON(label="Metadata")
        with gr.Column():
            # Starts hidden so it agrees with the unchecked "Show
            # wikimedia/wikipedia extraction" box; the toggle below shows it.
            right_markdown = gr.Textbox(label="wikimedia/wikipedia extraction", lines=30, visible=False)
            right_iframe = gr.HTML(label="Original Page")
            right_infoboxes = gr.JSON(label="Infoboxes")

    # Every handler returns one value per component in view_outputs, in this order.
    view_outputs = [idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe]
    filter_inputs = [require_has_math, require_has_infobox]
    language_inputs = [language_select, *filter_inputs]
    nav_inputs = [docs_state, idx_state, iter_state, *filter_inputs]

    language_select.change(on_select_language, inputs=language_inputs, outputs=view_outputs)
    demo.load(on_select_language, inputs=language_inputs, outputs=view_outputs)
    find_btn.click(on_find, inputs=[docs_state, idx_state, iter_state, id_input, *filter_inputs], outputs=view_outputs)

    # Visibility toggles driven directly by checkbox changes
    show_wiki.change(lambda v: gr.update(visible=v), inputs=[show_wiki], outputs=[right_markdown])
    show_preview.change(lambda v: gr.update(visible=v), inputs=[show_preview], outputs=[right_iframe])
    show_infoboxes.change(lambda v: gr.update(visible=v), inputs=[show_infoboxes], outputs=[right_infoboxes])
    prev_btn.click(on_prev, inputs=nav_inputs, outputs=view_outputs)
    next_btn.click(on_next, inputs=nav_inputs, outputs=view_outputs)


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7641)
|
default_wiki_pipeline.py
ADDED
|
@@ -0,0 +1,1273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# directly copied from https://huggingface.co/datasets/wikimedia/wikipedia/blob/script/wikipedia.py
|
| 2 |
+
"""Wikipedia dataset containing cleaned articles of all languages."""
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
import bz2
|
| 6 |
+
import csv
|
| 7 |
+
import io
|
| 8 |
+
import itertools
|
| 9 |
+
import json
|
| 10 |
+
import re
|
| 11 |
+
import xml.etree.ElementTree as etree
|
| 12 |
+
from urllib.parse import quote
|
| 13 |
+
|
| 14 |
+
import mwparserfromhell
|
| 15 |
+
|
| 16 |
+
import datasets
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
logger = datasets.logging.get_logger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
_HOMEPAGE = "https://dumps.wikimedia.org"
|
| 23 |
+
|
| 24 |
+
_CITATION = """\
|
| 25 |
+
@ONLINE {wikidump,
|
| 26 |
+
author = {Wikimedia Foundation},
|
| 27 |
+
title = {Wikimedia Downloads},
|
| 28 |
+
url = {https://dumps.wikimedia.org}
|
| 29 |
+
}
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
_DESCRIPTION = """\
|
| 33 |
+
Wikipedia dataset containing cleaned articles of all languages.
|
| 34 |
+
The datasets are built from the Wikipedia dump
|
| 35 |
+
(https://dumps.wikimedia.org/) with one split per language. Each example
|
| 36 |
+
contains the content of one full Wikipedia article with cleaning to strip
|
| 37 |
+
markdown and unwanted sections (references, etc.).
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
_LICENSE = (
|
| 41 |
+
"This work is licensed under the Creative Commons Attribution-ShareAlike "
|
| 42 |
+
"3.0 Unported License. To view a copy of this license, visit "
|
| 43 |
+
"http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to "
|
| 44 |
+
"Creative Commons, PO Box 1866, Mountain View, CA 94042, USA."
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Source: https://meta.wikimedia.org/wiki/List_of_Wikipedias
|
| 48 |
+
# Number: 326 = 339 - 13 (retrieved: 2023-11-17)
|
| 49 |
+
WIKIPEDIA_LANGUAGES = [
|
| 50 |
+
"ab",
|
| 51 |
+
"ace",
|
| 52 |
+
"ady",
|
| 53 |
+
"af",
|
| 54 |
+
"als",
|
| 55 |
+
"alt",
|
| 56 |
+
"am",
|
| 57 |
+
"ami",
|
| 58 |
+
"an",
|
| 59 |
+
"ang",
|
| 60 |
+
"anp",
|
| 61 |
+
"ar",
|
| 62 |
+
"arc",
|
| 63 |
+
"ary",
|
| 64 |
+
"arz",
|
| 65 |
+
"as",
|
| 66 |
+
"ast",
|
| 67 |
+
"atj",
|
| 68 |
+
"av",
|
| 69 |
+
"avk",
|
| 70 |
+
"awa",
|
| 71 |
+
"ay",
|
| 72 |
+
"az",
|
| 73 |
+
"azb",
|
| 74 |
+
"ba",
|
| 75 |
+
"ban",
|
| 76 |
+
"bar",
|
| 77 |
+
"bat-smg",
|
| 78 |
+
"bbc",
|
| 79 |
+
"bcl",
|
| 80 |
+
"be",
|
| 81 |
+
"be-tarask",
|
| 82 |
+
"bg",
|
| 83 |
+
"bh",
|
| 84 |
+
"bi",
|
| 85 |
+
"bjn",
|
| 86 |
+
"blk",
|
| 87 |
+
"bm",
|
| 88 |
+
"bn",
|
| 89 |
+
"bo",
|
| 90 |
+
"bpy",
|
| 91 |
+
"br",
|
| 92 |
+
"bs",
|
| 93 |
+
"bug",
|
| 94 |
+
"bxr",
|
| 95 |
+
"ca",
|
| 96 |
+
"cbk-zam",
|
| 97 |
+
"cdo",
|
| 98 |
+
"ce",
|
| 99 |
+
"ceb",
|
| 100 |
+
"ch",
|
| 101 |
+
"chr",
|
| 102 |
+
"chy",
|
| 103 |
+
"ckb",
|
| 104 |
+
"co",
|
| 105 |
+
"cr",
|
| 106 |
+
"crh",
|
| 107 |
+
"cs",
|
| 108 |
+
"csb",
|
| 109 |
+
"cu",
|
| 110 |
+
"cv",
|
| 111 |
+
"cy",
|
| 112 |
+
"da",
|
| 113 |
+
"dag",
|
| 114 |
+
"de",
|
| 115 |
+
"dga",
|
| 116 |
+
"din",
|
| 117 |
+
"diq",
|
| 118 |
+
"dsb",
|
| 119 |
+
"dty",
|
| 120 |
+
"dv",
|
| 121 |
+
"dz",
|
| 122 |
+
"ee",
|
| 123 |
+
"el",
|
| 124 |
+
"eml",
|
| 125 |
+
"en",
|
| 126 |
+
"eo",
|
| 127 |
+
"es",
|
| 128 |
+
"et",
|
| 129 |
+
"eu",
|
| 130 |
+
"ext",
|
| 131 |
+
"fa",
|
| 132 |
+
"fat",
|
| 133 |
+
"ff",
|
| 134 |
+
"fi",
|
| 135 |
+
"fiu-vro",
|
| 136 |
+
"fj",
|
| 137 |
+
"fo",
|
| 138 |
+
"fon",
|
| 139 |
+
"fr",
|
| 140 |
+
"frp",
|
| 141 |
+
"frr",
|
| 142 |
+
"fur",
|
| 143 |
+
"fy",
|
| 144 |
+
"ga",
|
| 145 |
+
"gag",
|
| 146 |
+
"gan",
|
| 147 |
+
"gcr",
|
| 148 |
+
"gd",
|
| 149 |
+
"gl",
|
| 150 |
+
"glk",
|
| 151 |
+
"gn",
|
| 152 |
+
"gom",
|
| 153 |
+
"gor",
|
| 154 |
+
"got",
|
| 155 |
+
"gpe",
|
| 156 |
+
"gu",
|
| 157 |
+
"guc",
|
| 158 |
+
"gur",
|
| 159 |
+
"guw",
|
| 160 |
+
"gv",
|
| 161 |
+
"ha",
|
| 162 |
+
"hak",
|
| 163 |
+
"haw",
|
| 164 |
+
"he",
|
| 165 |
+
"hi",
|
| 166 |
+
"hif",
|
| 167 |
+
"hr",
|
| 168 |
+
"hsb",
|
| 169 |
+
"ht",
|
| 170 |
+
"hu",
|
| 171 |
+
"hy",
|
| 172 |
+
"hyw",
|
| 173 |
+
"ia",
|
| 174 |
+
"id",
|
| 175 |
+
"ie",
|
| 176 |
+
"ig",
|
| 177 |
+
"ik",
|
| 178 |
+
"ilo",
|
| 179 |
+
"inh",
|
| 180 |
+
"io",
|
| 181 |
+
"is",
|
| 182 |
+
"it",
|
| 183 |
+
"iu",
|
| 184 |
+
"ja",
|
| 185 |
+
"jam",
|
| 186 |
+
"jbo",
|
| 187 |
+
"jv",
|
| 188 |
+
"ka",
|
| 189 |
+
"kaa",
|
| 190 |
+
"kab",
|
| 191 |
+
"kbd",
|
| 192 |
+
"kbp",
|
| 193 |
+
"kcg",
|
| 194 |
+
"kg",
|
| 195 |
+
"ki",
|
| 196 |
+
"kk",
|
| 197 |
+
"kl",
|
| 198 |
+
"km",
|
| 199 |
+
"kn",
|
| 200 |
+
"ko",
|
| 201 |
+
"koi",
|
| 202 |
+
"krc",
|
| 203 |
+
"ks",
|
| 204 |
+
"ksh",
|
| 205 |
+
"ku",
|
| 206 |
+
"kv",
|
| 207 |
+
"kw",
|
| 208 |
+
"ky",
|
| 209 |
+
"la",
|
| 210 |
+
"lad",
|
| 211 |
+
"lb",
|
| 212 |
+
"lbe",
|
| 213 |
+
"lez",
|
| 214 |
+
"lfn",
|
| 215 |
+
"lg",
|
| 216 |
+
"li",
|
| 217 |
+
"lij",
|
| 218 |
+
"lld",
|
| 219 |
+
"lmo",
|
| 220 |
+
"ln",
|
| 221 |
+
"lo",
|
| 222 |
+
"lt",
|
| 223 |
+
"ltg",
|
| 224 |
+
"lv",
|
| 225 |
+
"mad",
|
| 226 |
+
"mai",
|
| 227 |
+
"map-bms",
|
| 228 |
+
"mdf",
|
| 229 |
+
"mg",
|
| 230 |
+
"mhr",
|
| 231 |
+
"mi",
|
| 232 |
+
"min",
|
| 233 |
+
"mk",
|
| 234 |
+
"ml",
|
| 235 |
+
"mn",
|
| 236 |
+
"mni",
|
| 237 |
+
"mnw",
|
| 238 |
+
"mr",
|
| 239 |
+
"mrj",
|
| 240 |
+
"ms",
|
| 241 |
+
"mt",
|
| 242 |
+
"mwl",
|
| 243 |
+
"my",
|
| 244 |
+
"myv",
|
| 245 |
+
"mzn",
|
| 246 |
+
"nah",
|
| 247 |
+
"nap",
|
| 248 |
+
"nds",
|
| 249 |
+
"nds-nl",
|
| 250 |
+
"ne",
|
| 251 |
+
"new",
|
| 252 |
+
"nia",
|
| 253 |
+
"nl",
|
| 254 |
+
"nn",
|
| 255 |
+
"no",
|
| 256 |
+
"nov",
|
| 257 |
+
"nqo",
|
| 258 |
+
"nrm",
|
| 259 |
+
"nso",
|
| 260 |
+
"nv",
|
| 261 |
+
"ny",
|
| 262 |
+
"oc",
|
| 263 |
+
"olo",
|
| 264 |
+
"om",
|
| 265 |
+
"or",
|
| 266 |
+
"os",
|
| 267 |
+
"pa",
|
| 268 |
+
"pag",
|
| 269 |
+
"pam",
|
| 270 |
+
"pap",
|
| 271 |
+
"pcd",
|
| 272 |
+
"pcm",
|
| 273 |
+
"pdc",
|
| 274 |
+
"pfl",
|
| 275 |
+
"pi",
|
| 276 |
+
"pih",
|
| 277 |
+
"pl",
|
| 278 |
+
"pms",
|
| 279 |
+
"pnb",
|
| 280 |
+
"pnt",
|
| 281 |
+
"ps",
|
| 282 |
+
"pt",
|
| 283 |
+
"pwn",
|
| 284 |
+
"qu",
|
| 285 |
+
"rm",
|
| 286 |
+
"rmy",
|
| 287 |
+
"rn",
|
| 288 |
+
"ro",
|
| 289 |
+
"roa-rup",
|
| 290 |
+
"roa-tara",
|
| 291 |
+
"ru",
|
| 292 |
+
"rue",
|
| 293 |
+
"rw",
|
| 294 |
+
"sa",
|
| 295 |
+
"sah",
|
| 296 |
+
"sat",
|
| 297 |
+
"sc",
|
| 298 |
+
"scn",
|
| 299 |
+
"sco",
|
| 300 |
+
"sd",
|
| 301 |
+
"se",
|
| 302 |
+
"sg",
|
| 303 |
+
"sh",
|
| 304 |
+
"shi",
|
| 305 |
+
"shn",
|
| 306 |
+
"si",
|
| 307 |
+
"simple",
|
| 308 |
+
"sk",
|
| 309 |
+
"skr",
|
| 310 |
+
"sl",
|
| 311 |
+
"sm",
|
| 312 |
+
"smn",
|
| 313 |
+
"sn",
|
| 314 |
+
"so",
|
| 315 |
+
"sq",
|
| 316 |
+
"sr",
|
| 317 |
+
"srn",
|
| 318 |
+
"ss",
|
| 319 |
+
"st",
|
| 320 |
+
"stq",
|
| 321 |
+
"su",
|
| 322 |
+
"sv",
|
| 323 |
+
"sw",
|
| 324 |
+
"szl",
|
| 325 |
+
"szy",
|
| 326 |
+
"ta",
|
| 327 |
+
"tay",
|
| 328 |
+
"tcy",
|
| 329 |
+
"te",
|
| 330 |
+
"tet",
|
| 331 |
+
"tg",
|
| 332 |
+
"th",
|
| 333 |
+
"ti",
|
| 334 |
+
"tk",
|
| 335 |
+
"tl",
|
| 336 |
+
"tly",
|
| 337 |
+
"tn",
|
| 338 |
+
"to",
|
| 339 |
+
"tpi",
|
| 340 |
+
"tr",
|
| 341 |
+
"trv",
|
| 342 |
+
"ts",
|
| 343 |
+
"tt",
|
| 344 |
+
"tum",
|
| 345 |
+
"tw",
|
| 346 |
+
"ty",
|
| 347 |
+
"tyv",
|
| 348 |
+
"udm",
|
| 349 |
+
"ug",
|
| 350 |
+
"uk",
|
| 351 |
+
"ur",
|
| 352 |
+
"uz",
|
| 353 |
+
"ve",
|
| 354 |
+
"vec",
|
| 355 |
+
"vep",
|
| 356 |
+
"vi",
|
| 357 |
+
"vls",
|
| 358 |
+
"vo",
|
| 359 |
+
"wa",
|
| 360 |
+
"war",
|
| 361 |
+
"wo",
|
| 362 |
+
"wuu",
|
| 363 |
+
"xal",
|
| 364 |
+
"xh",
|
| 365 |
+
"xmf",
|
| 366 |
+
"yi",
|
| 367 |
+
"yo",
|
| 368 |
+
"za",
|
| 369 |
+
"zea",
|
| 370 |
+
"zgh",
|
| 371 |
+
"zh",
|
| 372 |
+
"zh-classical",
|
| 373 |
+
"zh-min-nan",
|
| 374 |
+
"zh-yue",
|
| 375 |
+
"zu",
|
| 376 |
+
]
|
| 377 |
+
|
| 378 |
+
# Source: for each Wikipedia language code (example shown for "ab"), aliases for namespaces -2 and 6 accessed via this API call:
|
| 379 |
+
# https://ab.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespacealiases|namespaces&format=json&formatversion=2
|
| 380 |
+
# Retrieved: 2023-11-17
|
| 381 |
+
MEDIA_ALIASES = {
|
| 382 |
+
"ab": ["Амедиа", "Афаил", "Изображение", "Медиа", "Файл"],
|
| 383 |
+
"ace": ["Alat", "Berkas", "Beureukaih", "Gambar"],
|
| 384 |
+
"ady": ["Медиа"],
|
| 385 |
+
"af": ["Beeld", "Lêer"],
|
| 386 |
+
"als": ["Bild", "Datei", "Medium"],
|
| 387 |
+
"alt": ["Изображение", "Медиа", "Файл"],
|
| 388 |
+
"am": ["ስዕል", "ፋይል"],
|
| 389 |
+
"ami": ["Faylo", "Mitiya", "图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 390 |
+
"an": ["Imachen", "Imagen"],
|
| 391 |
+
"ang": ["Biliþ", "Ymele"],
|
| 392 |
+
"anp": ["फाईल", "मीडिया"],
|
| 393 |
+
"ar": ["صورة", "ملف", "ميديا", "وسائط"],
|
| 394 |
+
"arc": ["ܠܦܦܐ", "ܡܝܕܝܐ"],
|
| 395 |
+
"ary": ["صورة", "فيشي", "ملف", "ميديا", "وسائط"],
|
| 396 |
+
"arz": ["صورة", "ملف", "ميديا", "وسائط"],
|
| 397 |
+
"as": ["चित्र", "চিত্র", "চিত্ৰ", "মাধ্যম"],
|
| 398 |
+
"ast": ["Archivu", "Ficheru", "Imagen", "Imaxe", "Imaxen", "Medios"],
|
| 399 |
+
"atj": ["Natisinahikaniwoc", "Tipatcimoctakewin"],
|
| 400 |
+
"av": ["Изображение", "Медиа", "Файл"],
|
| 401 |
+
"avk": ["Ewava", "Imagen", "Iyeltak", "Kanaca", "Mamind", "Изображение"],
|
| 402 |
+
"awa": ["फाइल", "मीडिया"],
|
| 403 |
+
"ay": ["Archivo", "Imagen", "Medio"],
|
| 404 |
+
"az": ["Fayl", "Mediya", "Şəkil"],
|
| 405 |
+
"azb": ["تصویر", "رسانه", "رسانهای", "فایل", "مدیا"],
|
| 406 |
+
"ba": ["Изображение", "Медиа", "Рәсем", "Файл"],
|
| 407 |
+
"ban": ["Berkas", "Gambar", "Média"],
|
| 408 |
+
"bar": ["Bild", "Datei", "Medium"],
|
| 409 |
+
"bat-smg": ["Abruozdielis", "Medėjė", "Vaizdas"],
|
| 410 |
+
"bbc": ["Ugasan"],
|
| 411 |
+
"bcl": ["Ladawan", "Medio"],
|
| 412 |
+
"be": ["Выява", "Мультымедыя", "Файл"],
|
| 413 |
+
"be-tarask": ["Выява", "Мэдыя", "Файл"],
|
| 414 |
+
"bg": ["Картинка", "Медия", "Файл"],
|
| 415 |
+
"bh": ["चित्र", "मीडिया"],
|
| 416 |
+
"bjn": ["Barakas", "Berkas", "Gambar"],
|
| 417 |
+
"blk": ["ဖုဲင်", "မီဒီယာ"],
|
| 418 |
+
"bm": ["Fichier", "Média"],
|
| 419 |
+
"bn": ["চিত্র", "মিডিয়া"],
|
| 420 |
+
"bpy": ["ছবি", "মিডিয়া"],
|
| 421 |
+
"br": ["Restr", "Skeudenn"],
|
| 422 |
+
"bs": ["Datoteka", "Medija", "Mediji", "Slika"],
|
| 423 |
+
"bug": ["Berkas", "Gambar"],
|
| 424 |
+
"bxr": ["Изображение", "Меди", "Файл"],
|
| 425 |
+
"ca": ["Fitxer", "Imatge"],
|
| 426 |
+
"cbk-zam": ["Archivo", "Imagen", "Medio"],
|
| 427 |
+
"cdo": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 428 |
+
"ce": ["Изображение", "Медиа", "Медйа", "Сурт", "Файл", "Хlум"],
|
| 429 |
+
"ceb": ["Imahen", "Medya", "Payl"],
|
| 430 |
+
"ch": ["Litratu"],
|
| 431 |
+
"ckb": ["میدیا", "پەڕگە"],
|
| 432 |
+
"co": ["Immagine"],
|
| 433 |
+
"crh": ["Fayl", "Resim", "Медиа", "Ресим", "Файл"],
|
| 434 |
+
"cs": ["Média", "Obrázok", "Soubor"],
|
| 435 |
+
"csb": ["Grafika", "Òbrôzk"],
|
| 436 |
+
"cu": ["Ви́дъ", "Видъ", "Дѣло", "Срѣдьства"],
|
| 437 |
+
"cv": ["Изображение", "Медиа", "Ӳкерчĕк"],
|
| 438 |
+
"cy": ["Delwedd"],
|
| 439 |
+
"da": ["Billede", "Fil"],
|
| 440 |
+
"dag": ["Lahabali kɔligu", "Miidiya"],
|
| 441 |
+
"de": ["Bild", "Datei", "Medium"],
|
| 442 |
+
"dga": ["Duoro bimbu zie", "Duoro kɔre"],
|
| 443 |
+
"din": ["Apamduööt", "Ciɛl"],
|
| 444 |
+
"diq": ["Dosya", "Medya"],
|
| 445 |
+
"dsb": ["Bild", "Dataja", "Medija", "Wobraz"],
|
| 446 |
+
"dty": ["चित्र", "मिडिया"],
|
| 447 |
+
"dv": ["މީޑިއާ", "ފައިލު", "ފައިލް"],
|
| 448 |
+
"el": ["Αρχείο", "Εικόνα", "Μέσο", "Μέσον"],
|
| 449 |
+
"eml": ["Immagine"],
|
| 450 |
+
"eo": ["Aŭdvidaĵo", "Dosiero"],
|
| 451 |
+
"es": ["Archivo", "Imagen", "Medio"],
|
| 452 |
+
"et": ["Fail", "Meedia", "Pilt"],
|
| 453 |
+
"eu": ["Fitxategi", "Irudi"],
|
| 454 |
+
"ext": ["Archivu", "Imagen", "Mediu"],
|
| 455 |
+
"fa": ["تصویر", "رسانه", "رسانهای", "مدیا", "پرونده"],
|
| 456 |
+
"fat": ["Fael"],
|
| 457 |
+
"ff": ["Fichier", "Média"],
|
| 458 |
+
"fi": ["Kuva", "Tiedosto"],
|
| 459 |
+
"fiu-vro": ["Meediä", "Pilt"],
|
| 460 |
+
"fo": ["Miðil", "Mynd"],
|
| 461 |
+
"fon": ["Wékpo", "Yɛwliɖonuji"],
|
| 462 |
+
"fr": ["Fichier", "Média"],
|
| 463 |
+
"frp": ["Fichiér", "Mèdia", "Émâge"],
|
| 464 |
+
"frr": ["Bild", "Datei", "Medium"],
|
| 465 |
+
"fur": ["Figure", "Immagine"],
|
| 466 |
+
"fy": ["Ofbyld"],
|
| 467 |
+
"ga": ["Meán", "Íomhá"],
|
| 468 |
+
"gag": ["Dosya", "Dosye", "Mediya", "Medya", "Resim"],
|
| 469 |
+
"gan": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "文檔", "档案", "檔案"],
|
| 470 |
+
"gcr": ["Fiché", "Médja"],
|
| 471 |
+
"gd": ["Faidhle", "Meadhan"],
|
| 472 |
+
"gl": ["Arquivo", "Ficheiro", "Imagem", "Imaxe"],
|
| 473 |
+
"glk": ["تصویر", "رسانه", "رسانهای", "فاىل", "مديا", "پرونده"],
|
| 474 |
+
"gn": ["Imagen", "Medio", "Ta'ãnga"],
|
| 475 |
+
"gom": ["फायल", "माध्यम", "मिडिया"],
|
| 476 |
+
"gor": ["Berkas", "Gambar"],
|
| 477 |
+
"got": ["𐍆𐌴𐌹𐌻𐌰"],
|
| 478 |
+
"gu": ["ચિત્ર", "દ્રશ્ય-શ્રાવ્ય (મિડિયા)"],
|
| 479 |
+
"guc": ["Anaajaalaa", "Ayaakuwapülee", "Imagen"],
|
| 480 |
+
"gur": ["Faali", "Miidiya"],
|
| 481 |
+
"guw": ["Wepo"],
|
| 482 |
+
"gv": ["Coadan", "Meanyn"],
|
| 483 |
+
"hak": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 484 |
+
"haw": ["Kiʻi", "Pāpaho", "Waihona"],
|
| 485 |
+
"he": ["מדיה", "קו", "קובץ", "תמונה"],
|
| 486 |
+
"hi": ["चित्र", "मीडिया"],
|
| 487 |
+
"hif": ["file", "saadhan"],
|
| 488 |
+
"hr": ["DT", "Datoteka", "Mediji", "Slika"],
|
| 489 |
+
"hsb": ["Bild", "Dataja", "Wobraz"],
|
| 490 |
+
"ht": ["Fichye", "Imaj", "Medya"],
|
| 491 |
+
"hu": ["Fájl", "Kép", "Média"],
|
| 492 |
+
"hy": ["Մեդիա", "Պատկեր"],
|
| 493 |
+
"hyw": ["Մեդիա", "Պատկեր"],
|
| 494 |
+
"ia": ["Imagine", "Multimedia"],
|
| 495 |
+
"id": ["Berkas", "Gambar"],
|
| 496 |
+
"ig": ["Midia", "Nká", "Usòrò", "Ákwúkwó orünotu"],
|
| 497 |
+
"ilo": ["Midia", "Papeles"],
|
| 498 |
+
"inh": ["Изображение", "Медиа", "Файл"],
|
| 499 |
+
"io": ["Arkivo", "Imajo"],
|
| 500 |
+
"is": ["Miðill", "Mynd"],
|
| 501 |
+
"it": ["Immagine"],
|
| 502 |
+
"ja": ["ファイル", "メディア", "画像"],
|
| 503 |
+
"jbo": ["datnyvei", "velsku"],
|
| 504 |
+
"jv": ["Barkas", "Gambar", "Medhia", "Médhia"],
|
| 505 |
+
"ka": ["მედია", "სურათი", "ფაილი"],
|
| 506 |
+
"kaa": ["Fayl", "Su'wret", "Swret", "Taspa", "Сурет", "Таспа", "تاسپا", "سۋرەت"],
|
| 507 |
+
"kab": ["Tugna"],
|
| 508 |
+
"kbd": ["Медиа", "Файл"],
|
| 509 |
+
"kbp": ["Fichier", "Média"],
|
| 510 |
+
"kcg": ["Fail"],
|
| 511 |
+
"kg": ["Fisye"],
|
| 512 |
+
"kk": ["Swret", "Taspa", "Сурет", "Таспа", "تاسپا", "سۋرەت"],
|
| 513 |
+
"kl": ["Billede", "Fiileq", "Fil"],
|
| 514 |
+
"km": ["មីឌា", "មេឌា", "រូបភាព", "ឯកសារ"],
|
| 515 |
+
"kn": ["ಚಿತ್ರ", "ಮೀಡಿಯ"],
|
| 516 |
+
"ko": ["그림", "미디어", "파일"],
|
| 517 |
+
"koi": ["Изображение", "Медиа", "Файл"],
|
| 518 |
+
"krc": ["Изображение", "Медиа", "Файл"],
|
| 519 |
+
"ks": ["فَیِل", "میڈیا"],
|
| 520 |
+
"ksh": ["Beld", "Belld", "Bild", "Datei", "Medie", "Medium", "Meedije", "Meedijum"],
|
| 521 |
+
"ku": ["Medya", "Wêne", "میدیا", "پەڕگە"],
|
| 522 |
+
"kv": ["Изображение", "Медиа", "Файл"],
|
| 523 |
+
"kw": ["Restren"],
|
| 524 |
+
"ky": ["Медиа", "Файл"],
|
| 525 |
+
"la": ["Fasciculus", "Imago"],
|
| 526 |
+
"lad": ["Archivo", "Dossia", "Dosya", "Imagen", "Meddia", "Medya"],
|
| 527 |
+
"lb": ["Bild", "Fichier"],
|
| 528 |
+
"lbe": ["Изображение", "Медиа", "Сурат"],
|
| 529 |
+
"lez": ["Mediya", "Şəkil", "Изображение", "Медиа", "Файл"],
|
| 530 |
+
"lfn": ["Fix"],
|
| 531 |
+
"li": ["Aafbeilding", "Afbeelding", "Plaetje"],
|
| 532 |
+
"lij": ["Immaggine", "Immagine"],
|
| 533 |
+
"lld": ["Immagine"],
|
| 534 |
+
"lmo": ["Archivi", "Immagine", "Imàjine"],
|
| 535 |
+
"ln": ["Fichier", "Média"],
|
| 536 |
+
"lo": ["ສື່", "ສື່ອ", "ຮູບ"],
|
| 537 |
+
"lt": ["Medija", "Vaizdas"],
|
| 538 |
+
"ltg": ["Fails", "Medeja"],
|
| 539 |
+
"lv": ["Attēls"],
|
| 540 |
+
"mad": ["Bhengkek", "Gambar", "Mèḍia"],
|
| 541 |
+
"mai": ["फाइल", "मेडिया"],
|
| 542 |
+
"map-bms": ["Barkas", "Gambar", "Medhia", "Médhia"],
|
| 543 |
+
"mdf": ["Изображение", "Медиа", "Няйф"],
|
| 544 |
+
"mg": ["Média", "Rakitra", "Sary"],
|
| 545 |
+
"mhr": ["Изображение", "Медиа", "Файл"],
|
| 546 |
+
"min": ["Berkas", "Gambar"],
|
| 547 |
+
"mk": ["Медиум", "Медија", "Податотека", "Слика"],
|
| 548 |
+
"ml": ["ചി", "ചിത്രം", "പ്ര", "പ്രമാണം", "മീഡിയ"],
|
| 549 |
+
"mn": ["Зураг", "Медиа", "Файл"],
|
| 550 |
+
"mni": ["ꯃꯦꯗꯤꯌꯥ", "ꯐꯥꯏꯜ"],
|
| 551 |
+
"mnw": ["မဳဒဳယာ", "ဝှာင်"],
|
| 552 |
+
"mr": ["चित्र", "मिडिया"],
|
| 553 |
+
"mrj": ["Изображение", "Медиа", "Файл"],
|
| 554 |
+
"ms": ["Fail", "Imej"],
|
| 555 |
+
"mt": ["Medja", "Midja", "Stampa"],
|
| 556 |
+
"mwl": ["Arquivo", "Fexeiro", "Ficheiro", "Imagem", "Multimédia"],
|
| 557 |
+
"my": ["ဖိုင်", "မီဒီယာ"],
|
| 558 |
+
"myv": ["Артовкс", "Изображение", "Медия"],
|
| 559 |
+
"mzn": ["تصویر", "رسانه", "رسانهای", "مدیا", "مهدیا", "پرونده"],
|
| 560 |
+
"nah": ["Imagen", "Mēdiatl", "Īxiptli"],
|
| 561 |
+
"nap": ["Fiùra", "Immagine"],
|
| 562 |
+
"nds": ["Bild", "Datei"],
|
| 563 |
+
"nds-nl": ["Afbeelding", "Bestaand", "Ofbeelding"],
|
| 564 |
+
"ne": ["चित्र", "मीडिया"],
|
| 565 |
+
"new": ["किपा", "माध्यम"],
|
| 566 |
+
"nia": ["Berkas", "Gambar"],
|
| 567 |
+
"nl": ["Afbeelding", "Bestand"],
|
| 568 |
+
"nn": ["Bilde", "Fil", "Filpeikar"],
|
| 569 |
+
"no": ["Bilde", "Fil", "Medium"],
|
| 570 |
+
"nqo": ["ߞߐߕߐ߮", "ߟߊߛߋߢߊߥߙߍ"],
|
| 571 |
+
"nrm": ["Fichier", "Média"],
|
| 572 |
+
"nso": ["Seswantšho"],
|
| 573 |
+
"nv": ["Eʼelyaaígíí"],
|
| 574 |
+
"oc": ["Fichièr", "Imatge", "Mèdia"],
|
| 575 |
+
"olo": ["Failu", "Kuva", "Medii"],
|
| 576 |
+
"or": ["ଫାଇଲ", "ମାଧ୍ୟମ"],
|
| 577 |
+
"os": ["Изображение", "Медиа", "Ныв", "Файл"],
|
| 578 |
+
"pa": ["ਤਸਵੀਰ", "ਮੀਡੀਆ"],
|
| 579 |
+
"pcd": ["Fichier", "Média"],
|
| 580 |
+
"pdc": ["Bild", "Datei", "Feil", "Medium"],
|
| 581 |
+
"pfl": ["Bild", "Dadai", "Datei", "Medium"],
|
| 582 |
+
"pi": ["पटिमा", "मीडिया"],
|
| 583 |
+
"pl": ["Grafika", "Plik"],
|
| 584 |
+
"pms": ["Figura", "Immagine"],
|
| 585 |
+
"pnb": ["تصویر", "فائل", "میڈیا"],
|
| 586 |
+
"pnt": ["Αρχείον", "Εικόνα", "Εικόναν", "Μέσον"],
|
| 587 |
+
"ps": ["انځور", "دوتنه", "رسنۍ"],
|
| 588 |
+
"pt": ["Arquivo", "Ficheiro", "Imagem", "Multimédia"],
|
| 589 |
+
"pwn": [
|
| 590 |
+
"mitiya",
|
| 591 |
+
"sineqetj a vecik",
|
| 592 |
+
"图像",
|
| 593 |
+
"图片",
|
| 594 |
+
"圖像",
|
| 595 |
+
"圖片",
|
| 596 |
+
"媒体",
|
| 597 |
+
"媒体文件",
|
| 598 |
+
"媒体档案",
|
| 599 |
+
"媒體",
|
| 600 |
+
"媒體文件",
|
| 601 |
+
"媒體檔案",
|
| 602 |
+
"文件",
|
| 603 |
+
"档案",
|
| 604 |
+
"檔案",
|
| 605 |
+
],
|
| 606 |
+
"qu": ["Imagen", "Midya", "Rikcha"],
|
| 607 |
+
"rm": ["Bild", "Datoteca", "Multimedia"],
|
| 608 |
+
"rmy": ["Chitro", "Fişier", "Imagine", "Mediya"],
|
| 609 |
+
"rn": ["Dosiye"],
|
| 610 |
+
"ro": ["Fişier", "Fișier", "Imagine"],
|
| 611 |
+
"roa-rup": ["Fişier", "Fișier", "Imagine"],
|
| 612 |
+
"roa-tara": ["Immagine"],
|
| 613 |
+
"ru": ["Изображение", "Медиа", "Файл"],
|
| 614 |
+
"rue": ["Зображення", "Изображение", "Медиа", "Медіа", "Файл"],
|
| 615 |
+
"rw": ["Dosiye", "Itangazamakuru"],
|
| 616 |
+
"sa": ["चित्रं", "चित्रम्", "माध्यम", "माध्यमम्", "सञ्चिका"],
|
| 617 |
+
"sah": ["Билэ", "Изображение", "Миэдьийэ", "Ойуу"],
|
| 618 |
+
"sat": ["ᱢᱤᱰᱤᱭᱟ", "ᱨᱮᱫ"],
|
| 619 |
+
"sc": ["Immàgini"],
|
| 620 |
+
"scn": ["Immagine", "Mmàggini", "Mèdia"],
|
| 621 |
+
"sd": ["ذريعات", "عڪس", "فائل"],
|
| 622 |
+
"se": ["Bilde", "Fiila", "Kuva"],
|
| 623 |
+
"sg": ["Fichier", "Média"],
|
| 624 |
+
"sh": ["DT", "Datoteka", "Fotografija", "Medija", "Mediji", "Slika", "ДТ", "Датотека", "Медиј"],
|
| 625 |
+
"shi": ["Afaylu", "Midya"],
|
| 626 |
+
"shn": ["သိုဝ်ႇၶၢဝ်ႇ", "ၾၢႆႇ"],
|
| 627 |
+
"si": ["ගොනුව", "මාධ්යය", "රූපය"],
|
| 628 |
+
"sk": ["Médiá", "Obrázok", "Súbor"],
|
| 629 |
+
"skr": ["فائل", "میڈیا"],
|
| 630 |
+
"sl": ["Datoteka", "Slika"],
|
| 631 |
+
"smn": ["Kuva", "Tiätuvuárkká"],
|
| 632 |
+
"sq": ["Figura", "Skeda"],
|
| 633 |
+
"sr": [
|
| 634 |
+
"Datoteka",
|
| 635 |
+
"Fotografija",
|
| 636 |
+
"Medij",
|
| 637 |
+
"Medija",
|
| 638 |
+
"Slika",
|
| 639 |
+
"Датотека",
|
| 640 |
+
"Медиј",
|
| 641 |
+
"Медија",
|
| 642 |
+
"Слика",
|
| 643 |
+
"Фотографија",
|
| 644 |
+
],
|
| 645 |
+
"srn": ["Afbeelding", "Gefre"],
|
| 646 |
+
"stq": ["Bielde", "Bild"],
|
| 647 |
+
"su": ["Gambar", "Média"],
|
| 648 |
+
"sv": ["Bild", "Fil"],
|
| 649 |
+
"sw": ["Faili", "Picha"],
|
| 650 |
+
"szl": ["Grafika", "Plik"],
|
| 651 |
+
"szy": ["myiti", "tangan", "图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 652 |
+
"ta": ["ஊடகம்", "படிமம்"],
|
| 653 |
+
"tay": [
|
| 654 |
+
"biru' na zayzyuwaw",
|
| 655 |
+
"biru’ na zayzyuwaw",
|
| 656 |
+
"media",
|
| 657 |
+
"图像",
|
| 658 |
+
"图片",
|
| 659 |
+
"圖像",
|
| 660 |
+
"圖片",
|
| 661 |
+
"媒体",
|
| 662 |
+
"媒体文件",
|
| 663 |
+
"媒体档案",
|
| 664 |
+
"媒體",
|
| 665 |
+
"媒體文件",
|
| 666 |
+
"媒體檔案",
|
| 667 |
+
"文件",
|
| 668 |
+
"档案",
|
| 669 |
+
"檔案",
|
| 670 |
+
],
|
| 671 |
+
"tcy": ["ಫೈಲ್", "ಮಾದ್ಯಮೊ"],
|
| 672 |
+
"te": ["దస్త్రం", "ఫైలు", "బొమ్మ", "మీడియా"],
|
| 673 |
+
"tet": ["Arquivo", "Imagem", "Imajen"],
|
| 674 |
+
"tg": ["Акс", "Медиа"],
|
| 675 |
+
"th": ["ภาพ", "สื่อ", "ไฟล์"],
|
| 676 |
+
"ti": ["ሜድያ", "ፋይል"],
|
| 677 |
+
"tk": ["Faýl"],
|
| 678 |
+
"tl": ["Midya", "Talaksan"],
|
| 679 |
+
"tly": ["Fajl", "Medja"],
|
| 680 |
+
"tn": ["Pego", "Setshwantsho"],
|
| 681 |
+
"tpi": ["Fail"],
|
| 682 |
+
"tr": ["Dosya", "Medya", "Ortam", "Resim"],
|
| 683 |
+
"trv": [
|
| 684 |
+
"Meyti",
|
| 685 |
+
"Patas bntasan",
|
| 686 |
+
"图像",
|
| 687 |
+
"图片",
|
| 688 |
+
"圖像",
|
| 689 |
+
"圖片",
|
| 690 |
+
"媒体",
|
| 691 |
+
"媒体文件",
|
| 692 |
+
"媒体档案",
|
| 693 |
+
"媒體",
|
| 694 |
+
"媒體文件",
|
| 695 |
+
"媒體檔案",
|
| 696 |
+
"文件",
|
| 697 |
+
"档案",
|
| 698 |
+
"檔案",
|
| 699 |
+
],
|
| 700 |
+
"tt": ["Räsem", "Изображение", "Медиа", "Рәсем", "Файл"],
|
| 701 |
+
"ty": ["Fichier", "Média"],
|
| 702 |
+
"tyv": ["Изображение", "Медиа", "Файл"],
|
| 703 |
+
"udm": ["Изображение", "Медиа", "Суред", "Файл"],
|
| 704 |
+
"ug": ["ھۆججەت", "ۋاسىتە"],
|
| 705 |
+
"uk": ["Зображення", "Изображение", "Медиа", "Медіа", "Файл"],
|
| 706 |
+
"ur": ["تصویر", "زریعہ", "فائل", "ملف", "میڈیا", "وسیط"],
|
| 707 |
+
"uz": ["Fayl", "Mediya", "Tasvir"],
|
| 708 |
+
"vec": ["Immagine", "Imàjine", "Mèdia"],
|
| 709 |
+
"vep": ["Fail", "Pilt"],
|
| 710 |
+
"vi": ["Hình", "Phương tiện", "Tập tin"],
|
| 711 |
+
"vls": ["Afbeelding", "Ofbeeldienge"],
|
| 712 |
+
"vo": ["Magod", "Nünamakanäd", "Ragiv"],
|
| 713 |
+
"wa": ["Imådje"],
|
| 714 |
+
"war": ["Fayl", "Medya", "Paypay"],
|
| 715 |
+
"wo": ["Dencukaay", "Xibaarukaay"],
|
| 716 |
+
"wuu": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 717 |
+
"xal": ["Аһар", "Боомг", "Зург", "Изображение"],
|
| 718 |
+
"xmf": ["მედია", "სურათი", "ფაილი"],
|
| 719 |
+
"yi": ["בילד", "טעקע", "מעדיע", "תמונה"],
|
| 720 |
+
"yo": ["Amóhùnmáwòrán", "Fáìlì", "Àwòrán"],
|
| 721 |
+
"za": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 722 |
+
"zea": ["Afbeelding", "Plaetje"],
|
| 723 |
+
"zgh": ["ⴰⴼⴰⵢⵍⵓ", "ⵎⵉⴷⵢⴰ"],
|
| 724 |
+
"zh": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 725 |
+
"zh-classical": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
|
| 726 |
+
"zh-min-nan": [
|
| 727 |
+
"Mûi-thé",
|
| 728 |
+
"tóng-àn",
|
| 729 |
+
"图像",
|
| 730 |
+
"图片",
|
| 731 |
+
"圖像",
|
| 732 |
+
"圖片",
|
| 733 |
+
"媒体",
|
| 734 |
+
"媒体文件",
|
| 735 |
+
"媒体档案",
|
| 736 |
+
"媒體",
|
| 737 |
+
"媒體文件",
|
| 738 |
+
"媒體檔案",
|
| 739 |
+
"文件",
|
| 740 |
+
"档案",
|
| 741 |
+
"檔案",
|
| 742 |
+
],
|
| 743 |
+
"zh-yue": ["图", "图像", "圖", "圖像", "媒体", "媒體", "文件", "档", "档案", "檔", "檔案"],
|
| 744 |
+
}
|
| 745 |
+
|
| 746 |
+
# Source: for each Wikipedia language code (example shown for "ab"), aliases for namespace 14 accessed via this API call:
|
| 747 |
+
# https://ab.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespacealiases|namespaces&format=json&formatversion=2
|
| 748 |
+
# Retrieved: 2023-11-17
|
| 749 |
+
CAT_ALIASES = {
|
| 750 |
+
"ab": ["Акатегориа", "Категория"],
|
| 751 |
+
"ace": ["Kategori", "Kawan"],
|
| 752 |
+
"af": ["Kategorie"],
|
| 753 |
+
"als": ["Kategorie"],
|
| 754 |
+
"alt": ["Категория"],
|
| 755 |
+
"am": ["መደብ"],
|
| 756 |
+
"ami": ["Kasasiwasiw", "分类", "分類"],
|
| 757 |
+
"an": ["Categoría"],
|
| 758 |
+
"ang": ["Flocc"],
|
| 759 |
+
"anp": ["श्रेणी"],
|
| 760 |
+
"ar": ["تصنيف"],
|
| 761 |
+
"arc": ["ܣܕܪܐ"],
|
| 762 |
+
"ary": ["تصنيف"],
|
| 763 |
+
"arz": ["تصنيف"],
|
| 764 |
+
"as": ["CAT", "श्रेणी", "শ্রেণী", "শ্ৰেণী"],
|
| 765 |
+
"ast": ["Categoría"],
|
| 766 |
+
"atj": ["Tipanictawin"],
|
| 767 |
+
"av": ["Категория"],
|
| 768 |
+
"avk": ["Loma"],
|
| 769 |
+
"awa": ["श्रेणी"],
|
| 770 |
+
"ay": ["Categoría"],
|
| 771 |
+
"az": ["Kateqoriya"],
|
| 772 |
+
"azb": ["بؤلمه"],
|
| 773 |
+
"ba": ["Категория", "Төркөм"],
|
| 774 |
+
"ban": ["Kategori"],
|
| 775 |
+
"bar": ["Kategorie"],
|
| 776 |
+
"bat-smg": ["Kategorija", "Kateguorėjė"],
|
| 777 |
+
"bbc": ["Horong"],
|
| 778 |
+
"bcl": ["Kategorya"],
|
| 779 |
+
"be": ["Катэгорыя"],
|
| 780 |
+
"be-tarask": ["Катэгорыя"],
|
| 781 |
+
"bg": ["Категория"],
|
| 782 |
+
"bh": ["श्रेणी"],
|
| 783 |
+
"bjn": ["Kategori", "Tumbung"],
|
| 784 |
+
"blk": ["ကဏ္ဍ"],
|
| 785 |
+
"bm": ["Catégorie"],
|
| 786 |
+
"bn": ["বিষয়শ্রেণী"],
|
| 787 |
+
"bpy": ["থাক"],
|
| 788 |
+
"br": ["Rummad"],
|
| 789 |
+
"bs": ["Kategorija"],
|
| 790 |
+
"bug": ["Kategori"],
|
| 791 |
+
"bxr": ["Категори", "Категория"],
|
| 792 |
+
"ca": ["Categoria"],
|
| 793 |
+
"cbk-zam": ["Categoría"],
|
| 794 |
+
"cdo": ["分类", "分類"],
|
| 795 |
+
"ce": ["Кадегар", "Категори", "Тоба"],
|
| 796 |
+
"ceb": ["Kategoriya"],
|
| 797 |
+
"ch": ["Katigoria"],
|
| 798 |
+
"ckb": ["پ", "پۆل"],
|
| 799 |
+
"co": ["Categoria"],
|
| 800 |
+
"crh": ["Kategoriya", "Категория"],
|
| 801 |
+
"cs": ["Kategorie"],
|
| 802 |
+
"csb": ["Kategòrëjô"],
|
| 803 |
+
"cu": ["Категория", "Катигорїꙗ", "Катигорї"],
|
| 804 |
+
"cv": ["Категори"],
|
| 805 |
+
"cy": ["Categori"],
|
| 806 |
+
"da": ["Kategori"],
|
| 807 |
+
"dag": ["Pubu"],
|
| 808 |
+
"de": ["Kategorie"],
|
| 809 |
+
"dga": ["Gbuli"],
|
| 810 |
+
"din": ["Bekätakthook"],
|
| 811 |
+
"diq": ["Kategori", "Kategoriye"],
|
| 812 |
+
"dsb": ["Kategorija"],
|
| 813 |
+
"dty": ["श्रेणी"],
|
| 814 |
+
"dv": ["ޤިސްމު"],
|
| 815 |
+
"el": ["Κατηγορία"],
|
| 816 |
+
"eml": ["Categoria"],
|
| 817 |
+
"eo": ["Kategorio"],
|
| 818 |
+
"es": ["CAT", "Categoría"],
|
| 819 |
+
"et": ["Kategooria"],
|
| 820 |
+
"eu": ["Kategoria"],
|
| 821 |
+
"ext": ["Categoria", "Categoría"],
|
| 822 |
+
"fa": ["رده"],
|
| 823 |
+
"fat": ["Nkyekyεmu"],
|
| 824 |
+
"ff": ["Catégorie"],
|
| 825 |
+
"fi": ["Luokka"],
|
| 826 |
+
"fiu-vro": ["Katõgooria"],
|
| 827 |
+
"fo": ["Bólkur"],
|
| 828 |
+
"fon": ["Akpaxwé"],
|
| 829 |
+
"fr": ["Catégorie"],
|
| 830 |
+
"frp": ["Catègorie"],
|
| 831 |
+
"frr": ["Kategorie"],
|
| 832 |
+
"fur": ["Categorie"],
|
| 833 |
+
"fy": ["Kategory"],
|
| 834 |
+
"ga": ["Catagóir", "Rang"],
|
| 835 |
+
"gag": ["Kategori", "Kategoriya"],
|
| 836 |
+
"gan": ["分类", "分類"],
|
| 837 |
+
"gcr": ["Katégori"],
|
| 838 |
+
"gd": ["Roinn-seòrsa"],
|
| 839 |
+
"gl": ["Categoría"],
|
| 840 |
+
"glk": ["جرگه", "رده"],
|
| 841 |
+
"gn": ["Ñemohenda"],
|
| 842 |
+
"gom": ["वर्ग", "श्रेणी"],
|
| 843 |
+
"gor": ["Dalala"],
|
| 844 |
+
"got": ["𐌷𐌰𐌽𐍃𐌰"],
|
| 845 |
+
"gu": ["CAT", "શ્રે", "શ્રેણી"],
|
| 846 |
+
"guc": ["Akotchajülee sünülia"],
|
| 847 |
+
"gur": ["Buuri buuri"],
|
| 848 |
+
"guw": ["Adà"],
|
| 849 |
+
"gv": ["Ronney"],
|
| 850 |
+
"hak": ["分类", "分類"],
|
| 851 |
+
"haw": ["Māhele"],
|
| 852 |
+
"he": ["קט", "קטגוריה"],
|
| 853 |
+
"hi": ["श्र", "श्रेणी"],
|
| 854 |
+
"hif": ["vibhag"],
|
| 855 |
+
"hr": ["CT", "KT", "Kategorija"],
|
| 856 |
+
"hsb": ["Kategorija"],
|
| 857 |
+
"ht": ["Kategori"],
|
| 858 |
+
"hu": ["Kategória"],
|
| 859 |
+
"hy": ["Կատեգորիա"],
|
| 860 |
+
"hyw": ["Ստորոգութիւն"],
|
| 861 |
+
"ia": ["Categoria"],
|
| 862 |
+
"id": ["Kategori"],
|
| 863 |
+
"ie": ["Categorie"],
|
| 864 |
+
"ig": ["Ébéonọr", "Òtù"],
|
| 865 |
+
"ilo": ["Kategoria"],
|
| 866 |
+
"inh": ["ОагӀат"],
|
| 867 |
+
"io": ["Kategorio"],
|
| 868 |
+
"is": ["Flokkur"],
|
| 869 |
+
"it": ["CAT", "Categoria"],
|
| 870 |
+
"ja": ["カテゴリ"],
|
| 871 |
+
"jbo": ["klesi"],
|
| 872 |
+
"jv": ["Kategori"],
|
| 873 |
+
"ka": ["კატეგორია"],
|
| 874 |
+
"kaa": ["Kategoriya", "Sanat", "Санат", "سانات"],
|
| 875 |
+
"kab": ["Taggayt"],
|
| 876 |
+
"kbd": ["Категориэ", "Категория"],
|
| 877 |
+
"kbp": ["Catégorie"],
|
| 878 |
+
"kcg": ["Sa"],
|
| 879 |
+
"kg": ["Kalasi"],
|
| 880 |
+
"kk": ["Sanat", "Санат", "سانات"],
|
| 881 |
+
"kl": ["Kategori", "Sumut atassuseq"],
|
| 882 |
+
"km": ["ចំណាត់ក្រុម", "ចំណាត់ថ្នាក់ក្រុម", "ចំនាត់ថ្នាក់ក្រុម"],
|
| 883 |
+
"kn": ["ವರ್ಗ"],
|
| 884 |
+
"ko": ["분류"],
|
| 885 |
+
"koi": ["Категория"],
|
| 886 |
+
"krc": ["Категория"],
|
| 887 |
+
"ks": ["زٲژ"],
|
| 888 |
+
"ksh": ["Kategorie", "Katejori", "Kattejori", "Saachjrop", "Saachjropp", "Saachjrupp", "Sachjrop"],
|
| 889 |
+
"ku": ["Kategorî", "پۆل"],
|
| 890 |
+
"kv": ["Категория"],
|
| 891 |
+
"kw": ["Class", "Klass"],
|
| 892 |
+
"ky": ["Категория"],
|
| 893 |
+
"la": ["Categoria"],
|
| 894 |
+
"lad": ["Categoría", "Kateggoría", "Katēggoría"],
|
| 895 |
+
"lb": ["Kategorie"],
|
| 896 |
+
"lbe": ["Категория"],
|
| 897 |
+
"lez": ["Категория"],
|
| 898 |
+
"lfn": ["Categoria"],
|
| 899 |
+
"li": ["Categorie", "Kategorie"],
|
| 900 |
+
"lij": ["Categoria", "Categorîa"],
|
| 901 |
+
"lld": ["Categoria"],
|
| 902 |
+
"lmo": ["Categoria", "Categuria"],
|
| 903 |
+
"ln": ["Catégorie"],
|
| 904 |
+
"lo": ["ໝວດ"],
|
| 905 |
+
"lt": ["Kategorija"],
|
| 906 |
+
"ltg": ["Kategoreja"],
|
| 907 |
+
"lv": ["Kategorija"],
|
| 908 |
+
"mad": ["Bhângsa"],
|
| 909 |
+
"mai": ["CA", "श्रेणी"],
|
| 910 |
+
"map-bms": ["Kategori"],
|
| 911 |
+
"mdf": ["Категорие", "Категория"],
|
| 912 |
+
"mg": ["Catégorie", "Sokajy"],
|
| 913 |
+
"mhr": ["Категорий", "Категория"],
|
| 914 |
+
"min": ["Kategori"],
|
| 915 |
+
"mk": ["Категорија"],
|
| 916 |
+
"ml": ["വ", "വി", "വിഭാഗം", "വർഗ്ഗം"],
|
| 917 |
+
"mn": ["Ангилал"],
|
| 918 |
+
"mni": ["ꯃꯆꯥꯈꯥꯏꯕ"],
|
| 919 |
+
"mnw": ["ကဏ္ဍ"],
|
| 920 |
+
"mr": ["वर्ग"],
|
| 921 |
+
"mrj": ["Категори", "Категория"],
|
| 922 |
+
"ms": ["Kategori"],
|
| 923 |
+
"mt": ["Kategorija"],
|
| 924 |
+
"mwl": ["Catadorie", "Categoria"],
|
| 925 |
+
"my": ["ကဏ္ဍ"],
|
| 926 |
+
"myv": ["Категория"],
|
| 927 |
+
"mzn": ["رج", "رده"],
|
| 928 |
+
"nah": ["Categoría", "Neneuhcāyōtl"],
|
| 929 |
+
"nap": ["Categoria", "Categurìa"],
|
| 930 |
+
"nds": ["Kategorie"],
|
| 931 |
+
"nds-nl": ["Categorie", "Kategorie", "Kattegerie"],
|
| 932 |
+
"ne": ["श्रेणी"],
|
| 933 |
+
"new": ["पुचः"],
|
| 934 |
+
"nia": ["Kategori"],
|
| 935 |
+
"nl": ["Categorie"],
|
| 936 |
+
"nn": ["Kategori"],
|
| 937 |
+
"no": ["Kategori"],
|
| 938 |
+
"nqo": ["ߦߌߟߡߊ"],
|
| 939 |
+
"nrm": ["Catégorie"],
|
| 940 |
+
"nso": ["Setensele"],
|
| 941 |
+
"nv": ["Tʼááłáhági átʼéego"],
|
| 942 |
+
"oc": ["Categoria"],
|
| 943 |
+
"olo": ["Kategourii"],
|
| 944 |
+
"or": ["ବିଭାଗ", "ଶ୍ରେଣୀ"],
|
| 945 |
+
"os": ["Категори"],
|
| 946 |
+
"pa": ["ਸ਼੍ਰੇਣੀ"],
|
| 947 |
+
"pcd": ["Catégorie"],
|
| 948 |
+
"pcm": ["Katigori"],
|
| 949 |
+
"pdc": ["Abdeeling", "Kategorie"],
|
| 950 |
+
"pfl": ["Kadegorie", "Kategorie", "Sachgrubb"],
|
| 951 |
+
"pi": ["विभाग"],
|
| 952 |
+
"pl": ["Kategoria"],
|
| 953 |
+
"pms": ["Categorìa"],
|
| 954 |
+
"pnb": ["گٹھ"],
|
| 955 |
+
"pnt": ["Κατηγορίαν"],
|
| 956 |
+
"ps": ["وېشنيزه"],
|
| 957 |
+
"pt": ["Categoria"],
|
| 958 |
+
"pwn": ["pinapapilipiliqan", "分类", "分類"],
|
| 959 |
+
"qu": ["Katiguriya"],
|
| 960 |
+
"rm": ["Categoria"],
|
| 961 |
+
"rmy": ["Shopni"],
|
| 962 |
+
"rn": ["Umuce"],
|
| 963 |
+
"ro": ["Categorie"],
|
| 964 |
+
"roa-rup": ["Categorie"],
|
| 965 |
+
"roa-tara": ["Categoria"],
|
| 966 |
+
"ru": ["К", "Категория"],
|
| 967 |
+
"rue": ["Категория", "Катеґорія"],
|
| 968 |
+
"rw": ["Ikiciro"],
|
| 969 |
+
"sa": ["वर्गः"],
|
| 970 |
+
"sah": ["Категория"],
|
| 971 |
+
"sat": ["ᱛᱷᱚᱠ"],
|
| 972 |
+
"sc": ["Categoria"],
|
| 973 |
+
"scn": ["Catigurìa"],
|
| 974 |
+
"sd": ["زمرو"],
|
| 975 |
+
"se": ["Kategoriija"],
|
| 976 |
+
"sg": ["Catégorie"],
|
| 977 |
+
"sh": ["KAT", "KT", "Kategorija", "КАТ", "КТ", "Категорија"],
|
| 978 |
+
"shi": ["Taggayt"],
|
| 979 |
+
"shn": ["ပိူင်ထၢၼ်ႈ"],
|
| 980 |
+
"si": ["ප්රවර්ගය"],
|
| 981 |
+
"sk": ["Kategória"],
|
| 982 |
+
"skr": ["ونکی"],
|
| 983 |
+
"sl": ["Kategorija"],
|
| 984 |
+
"smn": ["Luokka"],
|
| 985 |
+
"sq": ["Kategori", "Kategoria"],
|
| 986 |
+
"sr": ["Kategorija", "Категорија"],
|
| 987 |
+
"srn": ["Categorie", "Guru"],
|
| 988 |
+
"stq": ["Kategorie"],
|
| 989 |
+
"su": ["Kategori"],
|
| 990 |
+
"sv": ["Kategori"],
|
| 991 |
+
"sw": ["Jamii"],
|
| 992 |
+
"szl": ["Kategoria", "Kategoryjo"],
|
| 993 |
+
"szy": ["kakuniza", "分类", "分類"],
|
| 994 |
+
"ta": ["பகுப்பு"],
|
| 995 |
+
"tay": ["zyuwaw na", "分类", "分類"],
|
| 996 |
+
"tcy": ["ವರ್ಗೊ"],
|
| 997 |
+
"te": ["వర్గం"],
|
| 998 |
+
"tet": ["Kategoria", "Kategoría"],
|
| 999 |
+
"tg": ["Гурӯҳ"],
|
| 1000 |
+
"th": ["หมวดหมู่"],
|
| 1001 |
+
"ti": ["መደብ"],
|
| 1002 |
+
"tk": ["Kategoriýa"],
|
| 1003 |
+
"tl": ["Kategorya", "Kaurian"],
|
| 1004 |
+
"tly": ["Tispir"],
|
| 1005 |
+
"tn": ["Karolo"],
|
| 1006 |
+
"tpi": ["Grup"],
|
| 1007 |
+
"tr": ["KAT", "Kategori"],
|
| 1008 |
+
"trv": ["Snakun", "分类", "分類"],
|
| 1009 |
+
"tt": ["Törkem", "Категория", "Төркем"],
|
| 1010 |
+
"tw": ["Nkyekyεmu"],
|
| 1011 |
+
"ty": ["Catégorie"],
|
| 1012 |
+
"tyv": ["Аңгылал", "Категория"],
|
| 1013 |
+
"udm": ["Категория"],
|
| 1014 |
+
"ug": ["تۈر"],
|
| 1015 |
+
"uk": ["Категория", "Категорія"],
|
| 1016 |
+
"ur": ["زمرہ"],
|
| 1017 |
+
"uz": ["Kategoriya", "Turkum"],
|
| 1018 |
+
"vec": ["Categoria"],
|
| 1019 |
+
"vep": ["Kategorii"],
|
| 1020 |
+
"vi": ["Thể loại"],
|
| 1021 |
+
"vls": ["Categorie"],
|
| 1022 |
+
"vo": ["Klad"],
|
| 1023 |
+
"wa": ["Categoreye"],
|
| 1024 |
+
"war": ["Kaarangay"],
|
| 1025 |
+
"wo": ["Catégorie", "Wàll"],
|
| 1026 |
+
"wuu": ["分类", "分類"],
|
| 1027 |
+
"xal": ["Янз", "Әәшл"],
|
| 1028 |
+
"xmf": ["კატეგორია"],
|
| 1029 |
+
"yi": ["קאַטעגאָריע", "קאטעגאריע"],
|
| 1030 |
+
"yo": ["Ẹ̀ka"],
|
| 1031 |
+
"za": ["分类", "分類"],
|
| 1032 |
+
"zea": ["Categorie"],
|
| 1033 |
+
"zgh": ["ⴰⵙⵎⵉⵍ"],
|
| 1034 |
+
"zh": ["CAT", "分类", "分類"],
|
| 1035 |
+
"zh-classical": ["CAT", "分类", "分類"],
|
| 1036 |
+
"zh-min-nan": ["Lūi-pia̍t", "分类", "分類"],
|
| 1037 |
+
"zh-yue": ["分类", "分類", "类", "類"],
|
| 1038 |
+
}
|
| 1039 |
+
|
| 1040 |
+
# Note that Wikimedia servers have rate limited downloaders and they are capping the number of per-ip connections to 2
|
| 1041 |
+
# Their mirror sites do not have this cap: https://dumps.wikimedia.org/mirrors.html
|
| 1042 |
+
_HOST = "https://dumps.wikimedia.org"
|
| 1043 |
+
_URL_PATH_SEGMENT = "/{lang}wiki/{date}/"
|
| 1044 |
+
_INFO_FILE = "dumpstatus.json"
|
| 1045 |
+
|
| 1046 |
+
|
| 1047 |
+
_VERSION = datasets.Version("4.0.0", "")
|
| 1048 |
+
|
| 1049 |
+
|
| 1050 |
+
class WikipediaConfig(datasets.BuilderConfig):
    """Builder configuration identifying one Wikipedia dump (language + date + host)."""

    def __init__(self, language=None, date=None, host=_HOST, version=_VERSION, **kwargs):
        """Create the configuration for a single Wikipedia dump.

        Args:
            language (str): Language code of the Wikipedia edition to load.
            date (str): Dump date in YYYYMMDD format. Available dates are
                listed at https://dumps.wikimedia.org/enwiki/.
            host (str, defaults to 'https://dumps.wikimedia.org'): URL of the
                server hosting the dump. The official Wikimedia host
                rate-limits downloaders and caps per-IP connections at 2; to
                parallelize downloads, use a mirror from
                https://dumps.wikimedia.org/mirrors.html. Trailing slashes
                are stripped before the value is stored.
            version: Dataset version forwarded to the parent config.
            **kwargs: Keyword arguments forwarded to super.
        """
        config_name = f"{date}.{language}"
        summary = f"Wikipedia dataset for {language}, parsed from {date} dump."
        super().__init__(name=config_name, description=summary, version=version, **kwargs)
        self.language = language
        self.date = date
        # Normalise so the host can be concatenated with a path starting with "/".
        self.host = host.rstrip("/")
|
| 1074 |
+
|
| 1075 |
+
|
| 1076 |
+
class Wikipedia(datasets.GeneratorBasedBuilder):
    """Dataset builder that turns a Wikimedia multistream dump into cleaned articles."""

    # The config exposes a `host` option so callers can point at a mirror and
    # avoid the download caps of the official server.
    BUILDER_CONFIG_CLASS = WikipediaConfig

    def _info(self):
        """Return the dataset metadata: four string columns plus homepage and citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "url": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "text": datasets.Value("string"),
                }
            ),
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _get_base_url(self):
        """Return the URL of the dump directory for the configured host, language and date."""
        # Dump directory names use underscores where language codes use hyphens.
        return self.config.host + _URL_PATH_SEGMENT.format(
            lang=self.config.language.replace("-", "_"), date=self.config.date
        )

    def _split_generators(self, dl_manager):
        """Download the dump files and describe the single TRAIN split.

        Args:
            dl_manager: Download manager used to fetch the status file and the
                index / XML data files.

        Returns:
            A one-element list holding the TRAIN ``SplitGenerator``; its
            ``gen_kwargs`` are three parallel lists (file path, start offset,
            end offset), one entry per compressed stream.

        Raises:
            FileNotFoundError: If the multistream dump job is not marked 'done'.
            ValueError: If the number of index files differs from the number
                of XML files, so they cannot be paired reliably.
        """
        base_url = self._get_base_url()

        # Download dump status info file
        info_path = dl_manager.download_and_extract(base_url + _INFO_FILE)
        with open(info_path, encoding="utf-8") as f:
            dump_info = json.load(f)
        multistream_dump_info = dump_info["jobs"]["articlesmultistreamdump"]
        if multistream_dump_info["status"] != "done":
            raise FileNotFoundError(
                f"Specified dump ({base_url}) multistream status is not 'done':"
                f" {multistream_dump_info['status']}"
            )

        # Collect index (.txt) and multistream XML (.xml) data file URLs
        index_urls = []
        xml_urls = []
        for fname in multistream_dump_info["files"]:
            if ".txt" in fname:
                index_urls.append(base_url + fname)
            elif ".xml" in fname:
                xml_urls.append(base_url + fname)
        # Index and XML files are paired by sort order; zip() would silently
        # drop the surplus files on a mismatch, so fail loudly instead.
        if len(index_urls) != len(xml_urls):
            raise ValueError(
                f"Dump ({base_url}) lists {len(index_urls)} index files but"
                f" {len(xml_urls)} XML files; cannot pair them."
            )
        data_urls = list(zip(sorted(index_urls), sorted(xml_urls)))  # Parallelize data downloading
        data_paths = dl_manager.download(data_urls)

        # Parallelize over concatenated multiple compressed streams (with 100 pages each)
        filepaths, starts, ends = [], [], []
        for index_path, xml_path in data_paths:
            index = _extract_index(index_path)
            # The last pair has end=None, meaning "read to the end of the file".
            for start, end in _pairwise(index):
                filepaths.append(xml_path)
                starts.append(start)
                ends.append(end)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": filepaths,
                    "starts": starts,
                    "ends": ends,
                },
            )
        ]

    def _generate_examples(self, filepaths, starts, ends):
        """Yield ``(key, example)`` pairs for every article in the given streams.

        Args:
            filepaths: Paths of the multistream XML files, one per stream.
            starts: Byte offset where each stream begins.
            ends: Byte offset where each stream ends (falsy means end of file).
        """
        # No shuffle anymore
        for filepath, start, end in zip(filepaths, starts, ends):
            for id_, title, raw_content in _extract_content(filepath, start, end):
                yield from _clean_content((id_, title, raw_content), self.config.language)
|
| 1153 |
+
|
| 1154 |
+
|
| 1155 |
+
def _extract_index(filepath):
    """Return the sorted, de-duplicated offsets listed in a bz2-compressed index file.

    Only the text before the first ``:`` of each line is used; it is parsed as
    an integer. The line is split manually rather than with ``csv.reader``
    because CSV quote handling treats a ``"`` at the start of a later field as
    the beginning of a quoted field, which can swallow the following lines
    (and their offsets). Blank lines are skipped.

    Args:
        filepath: Path to the bz2-compressed index file.

    Returns:
        A sorted list of unique integer offsets.

    Raises:
        ValueError: If the leading field of a non-blank line is not an integer.
    """
    import bz2  # local import keeps this block self-contained

    offsets = set()
    with bz2.open(filepath, mode="rt", encoding="utf-8") as index_file:
        for line in index_file:
            leading_field = line.partition(":")[0].strip()
            if not leading_field:
                continue
            offsets.add(int(leading_field))
    return sorted(offsets)
|
| 1160 |
+
|
| 1161 |
+
|
| 1162 |
+
def _pairwise(index):
    """Pair each item of *index* with its successor.

    The final item is paired with ``None`` (the result has as many pairs as
    *index* has items), so callers can treat ``None`` as "no upper bound".
    """
    current_items, next_items = itertools.tee(index, 2)
    # Shift the second iterator forward by one element.
    next(next_items, None)
    return itertools.zip_longest(current_items, next_items)
|
| 1166 |
+
|
| 1167 |
+
|
| 1168 |
+
def _extract_content(filepath, start, end):
    """Yield ``(id, title, raw_content)`` for the articles of one compressed stream.

    Reads the bytes between *start* and *end* of *filepath* (to the end of the
    file when *end* is falsy), decompresses them as a single bz2 stream and
    parses the resulting XML fragment. Pages outside namespace "0", redirects
    and pages without text are skipped.
    """
    logger.info("generating examples from = %s", filepath)
    byte_count = end - start if end else -1
    with open(filepath, "rb") as dump_file:
        dump_file.seek(start)
        stream_bytes = dump_file.read(byte_count)
    xml_fragment = bz2.BZ2Decompressor().decompress(stream_bytes)
    # Enclose within a single root node to avoid ParseError: junk after document element
    xml_text = (b"<mediawiki>" + xml_fragment + b"</mediawiki>").decode(encoding="utf-8")
    with io.StringIO(xml_text) as text_stream:
        for _, page in etree.iterparse(text_stream):
            tag = page.tag
            if not tag.endswith("page"):
                continue
            # Whatever precedes "page" in the tag is reused as the prefix of child tags.
            prefix = tag[:-4]
            ns = page.find(f"./{prefix}ns").text
            is_redirect = page.find(f"./{prefix}redirect") is not None
            # Keep only non-redirect pages from the "main" namespace
            if is_redirect or ns != "0":
                page.clear()
                continue
            id_ = page.find(f"./{prefix}id").text
            title = page.find(f"./{prefix}title").text
            raw_content = page.find(f"./{prefix}revision/{prefix}text").text
            # Free the parsed subtree once the needed fields are extracted.
            page.clear()
            # Emit only pages that actually have text
            if raw_content is not None:
                yield id_, title, raw_content
|
| 1196 |
+
|
| 1197 |
+
|
| 1198 |
+
def _clean_content(inputs, language):
    """Turn one raw page into at most one ``(key, example)`` pair.

    Args:
        inputs: Tuple ``(id, title, raw_content)`` for a single page.
        language: Language code used for cleaning and for building the URL.

    Yields:
        ``(id, {"id", "url", "title", "text"})`` when parsing succeeds and the
        cleaned text is non-empty; nothing otherwise.
    """
    id_, title, raw_content = inputs
    try:
        text = _parse_and_clean_wikicode(raw_content, parser=mwparserfromhell, language=language)
    except mwparserfromhell.parser.ParserError as e:
        # Unparseable pages are logged and dropped rather than aborting the run.
        logger.error("mwparserfromhell ParseError: %s", e)
        return
    if text:
        example = {"id": id_, "url": _construct_url(title, language), "title": title, "text": text}
        yield id_, example
|
| 1210 |
+
|
| 1211 |
+
|
| 1212 |
+
def _parse_and_clean_wikicode(raw_content, parser, language):
    """Convert raw wikicode into plain text, one paragraph block per section.

    Media links (file/image/media, plus the language's aliases), ``ref`` tags
    and ``table`` tags are removed; category links are kept but lose their
    category prefix; magic words such as ``__NOTOC__`` are stripped.

    Args:
        raw_content: Raw wikicode of a page.
        parser: Object exposing ``parse(raw_content)`` (called with the
            ``mwparserfromhell`` module elsewhere in this file).
        language: Language code used to look up localized namespace aliases.

    Returns:
        The cleaned section texts joined by blank lines.
    """
    wikicode = parser.parse(raw_content)

    # Magic words are parser instructions -- e.g., __NOTOC__
    magic_word_re = re.compile("__[A-Z]*__", flags=re.UNICODE)

    # Prefixes identifying file/image links.
    media_alternatives = "|".join(["File", "Image", "Media"] + MEDIA_ALIASES.get(language, []))
    media_link_re = re.compile(f"^(?:{media_alternatives}):", flags=re.IGNORECASE | re.UNICODE)

    # Prefixes identifying category links (left in place, prefix removed).
    category_alternatives = "|".join(["Category"] + CAT_ALIASES.get(language, []))
    category_link_re = re.compile(f"^(?:{category_alternatives}):", flags=re.IGNORECASE | re.UNICODE)

    def is_unwanted_tag(node):
        # References and tables are dropped entirely.
        return str(node.tag) in {"ref", "table"}

    def discard(node, section):
        try:
            section.remove(node)
        except ValueError:
            # For unknown reasons, objects are sometimes not found.
            pass

    def strip_category_prefix(link):
        try:
            link.text = category_link_re.sub("", link.__strip__())
        except ValueError:
            # For unknown reasons, objects are sometimes not found.
            pass

    cleaned_sections = []
    # Clean each section independently.
    for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True):
        for link in section.ifilter_wikilinks(recursive=True):
            link_title = str(link.title)
            if media_link_re.match(link_title):
                discard(link, section)
            elif category_link_re.match(link_title):
                strip_category_prefix(link)
        for tag in section.ifilter_tags(matches=is_unwanted_tag, recursive=True):
            discard(tag, section)

        plain_text = section.strip_code().strip()
        cleaned_sections.append(magic_word_re.sub("", plain_text))
    return "\n\n".join(cleaned_sections)
|
| 1269 |
+
|
| 1270 |
+
|
| 1271 |
+
def _construct_url(title, language):
    """Build the article URL for *title* on the *language* Wikipedia.

    The title is percent-encoded with ``quote`` before being placed in the path.
    """
    # See: https://meta.wikimedia.org/wiki/Help:URL
    encoded_title = quote(title)
    return f"https://{language}.wikipedia.org/wiki/{encoded_title}"
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datatrove[io]==0.6.0
|
| 2 |
+
gradio==5.49.1
|
| 3 |
+
mwparserfromhell==0.7.2
|
| 4 |
+
datasets==4.0.0
|