guipenedo HF Staff committed on
Commit
1f29f01
·
1 Parent(s): da7f028
Files changed (4) hide show
  1. README.md +4 -4
  2. app.py +293 -0
  3. default_wiki_pipeline.py +1273 -0
  4. requirements.txt +4 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Finewiki Viewer
3
- emoji: 🏢
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
  app_file: app.py
 
1
  ---
2
+ title: FineWiki Viewer
3
+ emoji: 🌐
4
+ colorFrom: white
5
+ colorTo: black
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # _LOCAL_TMP = "/fsx/guilherme/tmp"
3
+ # try:
4
+ # os.makedirs(_LOCAL_TMP, exist_ok=True)
5
+ # os.environ.setdefault("TMPDIR", _LOCAL_TMP)
6
+ # os.environ.setdefault("TEMP", _LOCAL_TMP)
7
+ # os.environ.setdefault("TMP", _LOCAL_TMP)
8
+ # _GRADIO_TMP = os.path.join(_LOCAL_TMP, "gradio")
9
+ # os.makedirs(_GRADIO_TMP, exist_ok=True)
10
+ # os.environ.setdefault("GRADIO_TEMP_DIR", _GRADIO_TMP)
11
+ # except Exception:
12
+ # pass
13
+
14
+ import gradio as gr
15
+ from datatrove.pipeline.readers import ParquetReader
16
+ from default_wiki_pipeline import _parse_and_clean_wikicode, mwparserfromhell
17
+
18
+ lang_list = ['ab', 'ace', 'ady', 'af', 'als', 'alt', 'ami', 'am', 'ang', 'anp', 'an', 'arc', 'ar', 'ary', 'arz', 'ast', 'as', 'atj', 'avk', 'av', 'awa', 'ay', 'azb', 'az', 'ban', 'bar', 'bat_smg', 'ba', 'bbc', 'bcl', 'be', 'bg', 'bh', 'bi', 'bjn', 'blk', 'bm', 'bn', 'bo', 'bpy', 'br', 'bs', 'bug', 'bxr', 'ca', 'cbk_zam', 'cdo', 'ceb', 'ce', 'chr', 'ch', 'chy', 'ckb', 'co', 'crh', 'cr', 'csb', 'cs', 'cu', 'cv', 'cy', 'dag', 'da', 'de', 'dga', 'din', 'diq', 'dsb', 'dty', 'dv', 'dz', 'ee', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fat', 'fa', 'ff', 'fiu_vro', 'fi', 'fj', 'fon', 'fo', 'frp', 'frr', 'fr', 'fur', 'fy', 'gag', 'gan', 'ga', 'gcr', 'gd', 'glk', 'gl', 'gn', 'gom', 'gor', 'got', 'gpe', 'guc', 'gur', 'gu', 'guw', 'gv', 'hak', 'ha', 'haw', 'he', 'hif', 'hi', 'hr', 'hsb', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'ie', 'ig', 'ik', 'ilo', 'inh', 'io', 'is', 'it', 'iu', 'jam', 'ja', 'jbo', 'jv', 'kaa', 'kab', 'ka', 'kbd', 'kbp', 'kcg', 'kg', 'ki', 'kk', 'kl', 'km', 'kn', 'koi', 'ko', 'krc', 'ksh', 'ks', 'ku', 'kv', 'kw', 'ky', 'lad', 'la', 'lbe', 'lb', 'lez', 'lfn', 'lg', 'lij', 'li', 'lld', 'lmo', 'ln', 'lo', 'ltg', 'lt', 'lv', 'mad', 'mai', 'map_bms', 'mdf', 'mg', 'mhr', 'min', 'mi', 'mk', 'ml', 'mni', 'mn', 'mnw', 'mrj', 'mr', 'ms', 'mt', 'mwl', 'myv', 'my', 'mzn', 'nah', 'nap', 'nds_nl', 'nds', 'ne', 'new', 'nia', 'nl', 'nn', 'nov', 'no', 'nqo', 'nrm', 'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pag', 'pam', 'pap', 'pa', 'pcd', 'pcm', 'pdc', 'pfl', 'pih', 'pi', 'pl', 'pms', 'pnb', 'pnt', 'ps', 'pt', 'pwn', 'qu', 'rm', 'rmy', 'rn', 'roa_rup', 'roa_tara', 'ro', 'rue', 'ru', 'rw', 'sah', 'sat', 'sa', 'scn', 'sco', 'sc', 'sd', 'se', 'sg', 'shi', 'shn', 'sh', 'simple', 'si', 'skr', 'sk', 'sl', 'smn', 'sm', 'sn', 'so', 'sq', 'srn', 'sr', 'ss', 'stq', 'st', 'su', 'sv', 'sw', 'szl', 'szy', 'ta', 'tay', 'tcy', 'tet', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tly', 'tn', 'to', 'tpi', 'trv', 'tr', 'ts', 'tt', 'tum', 'tw', 'tyv', 'ty', 'udm', 'ug', 'uk', 'ur', 
'uz', 'vec', 'vep', 've', 'vi', 'vls', 'vo', 'war', 'wa', 'wo', 'wuu', 'xal', 'xh', 'xmf', 'yi', 'yo', 'za', 'zea', 'zgh', 'zh_classical', 'zh_min_nan', 'zh_yue', 'zh', 'zu']
19
+
20
+
21
+ def _build_header_markdown(doc) -> str:
22
+ meta = doc.metadata or {}
23
+ title = meta.get("title") or ""
24
+ page_id = meta.get("page_id") or meta.get("id") or ""
25
+ wikidata_id = meta.get("wikidata_id") or ""
26
+ url = meta.get("url") or ""
27
+ parts = []
28
+ if title:
29
+ parts.append(f"**Title**: {title}")
30
+ if page_id:
31
+ parts.append(f"**Page ID**: {page_id}")
32
+ if wikidata_id:
33
+ parts.append(f"**Wikidata ID**: {wikidata_id}")
34
+ header = " | ".join(parts)
35
+ if url:
36
+ header += f"\n[{url}]({url})"
37
+ return header
38
+ def matches_filters(doc, require_has_math: bool | None, require_has_infobox: bool | None) -> bool:
39
+ meta = doc.metadata or {}
40
+ if require_has_math and not bool(meta.get("has_math")):
41
+ return False
42
+ if require_has_infobox and not meta.get("infoboxes"):
43
+ return False
44
+ return True
45
+
46
+
47
def find_next_matching_from(
    docs_cache,
    reader_iter,
    start_idx: int,
    require_has_math: bool | None,
    require_has_infobox: bool | None,
):
    """Find the first document after ``start_idx`` that passes the filters.

    Cached documents are checked first; if none matches, documents are pulled
    from ``reader_iter`` one at a time (and appended to the cache) until a
    match appears or the reader yields nothing more.

    Args:
        docs_cache: List of documents loaded so far.
        reader_iter: Iterator yielding further documents, or ``None``.
        start_idx: Index to search after (exclusive); values below ``-1`` are
            treated as ``-1``.
        require_has_math: Forwarded to ``matches_filters``.
        require_has_infobox: Forwarded to ``matches_filters``.

    Returns:
        ``(index, docs_cache, reader_iter)`` where ``index`` is ``-1`` when no
        document matches.
    """
    # Pass 1: what is already cached.
    first_candidate = max(-1, start_idx) + 1
    for position in range(first_candidate, len(docs_cache)):
        if matches_filters(docs_cache[position], require_has_math, require_has_infobox):
            return position, docs_cache, reader_iter

    # Pass 2: extend the cache by one document per iteration.
    while True:
        known = len(docs_cache)
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, known)
        if len(docs_cache) == known:
            # Nothing new arrived: the reader is missing or exhausted.
            return -1, docs_cache, reader_iter
        if matches_filters(docs_cache[-1], require_has_math, require_has_infobox):
            return len(docs_cache) - 1, docs_cache, reader_iter
63
+
64
def render_iframe(url: str, height: int = 800) -> str:
    """Return the HTML for an ``<iframe>`` that previews ``url``.

    Args:
        url: Page address to embed; an empty or ``None`` value falls back to
            ``about:blank``.
        height: Frame height in pixels.

    Returns:
        The ``<iframe>`` markup as a string.
    """
    import html

    # The URL comes from dataset metadata, so escape it before placing it in
    # an attribute: an embedded quote must not be able to close ``src`` and
    # inject further attributes.
    safe_url = html.escape(url or "about:blank", quote=True)
    return (
        f'<iframe src="{safe_url}" '
        f'style="width:100%; height:{height}px; border:0;" loading="lazy"></iframe>'
    )
70
+
71
+
72
+ def _safe_url_from_metadata(meta: dict) -> str:
73
+ meta = meta or {}
74
+ return meta.get("url") or ""
75
+
76
+
77
+ def _extract_language(meta: dict) -> str:
78
+ # Try common metadata fields for language code
79
+ meta = meta or {}
80
+ lang = meta.get("lang") or meta.get("language")
81
+ if lang:
82
+ return str(lang)
83
+ wiki = meta.get("wiki") or meta.get("wikiname") or ""
84
+ base = str(wiki).removesuffix("_namespace_0") if wiki else ""
85
+ if base.endswith("wiki"):
86
+ return base[:-4]
87
+ return base or "en"
88
+
89
+
90
+ def _ensure_until_index(docs_cache, reader_iter, target_idx: int):
91
+ if reader_iter is None:
92
+ return docs_cache, reader_iter
93
+ while len(docs_cache) <= target_idx:
94
+ try:
95
+ nxt = next(reader_iter)
96
+ except StopIteration:
97
+ break
98
+ docs_cache.append(nxt)
99
+ return docs_cache, reader_iter
100
+
101
+
102
def on_select_language(lang: str, require_has_math: bool, require_has_infobox: bool):
    """Load documents for the selected language from HF Parquet and display.

    Opens ``hf://datasets/HuggingFaceFW/finewiki/data/<lang>wiki`` with
    ``ParquetReader`` and renders the first document that passes the filters.

    Args:
        lang: Language code chosen in the dropdown.
        require_has_math: Only show documents flagged as containing math.
        require_has_infobox: Only show documents that have infoboxes.

    Returns:
        A 9-tuple ``(index, docs_cache, reader_iter, left_text, metadata,
        header, wiki_text, infoboxes, iframe_html)``. When nothing can be
        shown, ``index`` is ``-1`` and ``left_text`` carries the message.
    """

    def _message_only(docs, reader, message):
        # Exactly nine values, one per output of the event this handler is
        # wired to; returning fewer makes Gradio reject the callback result.
        return (-1, docs, reader, message, {}, "", "", [], render_iframe(""))

    language = (lang or "").strip()
    if not language:
        return _message_only([], None, "Select a language.")

    docs_cache = []
    try:
        path = f"hf://datasets/HuggingFaceFW/finewiki/data/{language}wiki"
        reader_iter = ParquetReader(path)()
        # NOTE(review): the reader call presumably returns a lazy generator,
        # so read errors would surface on the first fetch rather than on
        # construction; keep that fetch inside the try so they are reported.
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
    except Exception as e:  # UI boundary: report the failure instead of crashing
        return _message_only([], None, f"Failed to read: {e}")

    if not docs_cache:
        return _message_only([], reader_iter, "No documents found.")

    # Find first doc matching filters (starting before 0)
    idx, docs_cache, reader_iter = find_next_matching_from(
        docs_cache, reader_iter, -1, require_has_math, require_has_infobox
    )
    if idx == -1:
        return _message_only(docs_cache, reader_iter, "No documents match filters.")

    left, left_meta, md, info, right, header = render_idx(docs_cache, idx)
    return (idx, docs_cache, reader_iter, left, left_meta, header, md, info, right)
122
+
123
+
124
def on_find(docs_cache, idx: int, reader_iter, id_query: str, require_has_math: bool, require_has_infobox: bool):
    """Jump to the document identified by ``id_query`` and render it.

    The query is compared with each document's ``id`` and with its metadata
    ``wikidata_id`` and ``url``. The search order is:

    1. exact match among cached documents;
    2. suffix match among cached documents (``id`` ending in ``/<query>`` or
       ``url`` ending in ``<query>``);
    3. exact or suffix match against documents streamed one at a time from
       ``reader_iter`` (each streamed document is added to the cache).

    The first document at or after the hit that passes the filters is shown.
    An empty query, or a query that is never found, shows the first document
    that passes the filters.

    Args:
        docs_cache: List of documents loaded so far.
        idx: Currently displayed index; received from the event wiring but
            not used by the search.
        reader_iter: Iterator yielding further documents, or ``None``.
        id_query: Text typed by the user.
        require_has_math: Only show documents flagged as containing math.
        require_has_infobox: Only show documents that have infoboxes.

    Returns:
        A 9-tuple ``(index, docs_cache, reader_iter, left_text, metadata,
        header, wiki_text, infoboxes, iframe_html)``. When nothing can be
        shown, ``index`` is ``-1`` and ``left_text`` carries the message.
    """
    query = (id_query or "").strip()

    def _blank(message):
        # Exactly nine values, one per output of the event this handler is
        # wired to; returning fewer makes Gradio reject the callback result.
        return -1, docs_cache, reader_iter, message, {}, "", "", [], render_iframe("")

    def _show_from(first_candidate):
        # Render the first document at index >= first_candidate that passes
        # the filters (find_next_matching_from searches *after* its start).
        nonlocal docs_cache, reader_iter
        new_idx, docs_cache, reader_iter = find_next_matching_from(
            docs_cache, reader_iter, first_candidate - 1, require_has_math, require_has_infobox
        )
        if new_idx == -1:
            return _blank("No documents match filters.")
        left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
        return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right

    def _keys(doc):
        # (id, wikidata_id, url) with missing id/url normalised to "".
        meta = getattr(doc, "metadata", None) or {}
        return (getattr(doc, "id", None) or ""), meta.get("wikidata_id"), (meta.get("url") or "")

    def _is_exact(doc):
        doc_id, wikidata_id, url = _keys(doc)
        return doc_id == query or wikidata_id == query or url == query

    def _is_suffix(doc):
        doc_id, _, url = _keys(doc)
        return doc_id.endswith(f"/{query}") or url.endswith(query)

    if not docs_cache and reader_iter is None:
        return _blank("No documents loaded.")

    if not query:
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
        return _show_from(0)

    # Cached documents: every exact match outranks any suffix match, hence
    # two full passes rather than one combined test.
    for is_hit in (_is_exact, _is_suffix):
        for i, doc in enumerate(docs_cache):
            if is_hit(doc):
                return _show_from(i)

    # Stream forward until found or exhausted
    while True:
        known = len(docs_cache)
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, known)
        if len(docs_cache) == known:
            break
        newest = docs_cache[-1]
        if _is_exact(newest) or _is_suffix(newest):
            return _show_from(len(docs_cache) - 1)

    # Query not found anywhere: fall back to the first document that passes.
    if not docs_cache:
        return _blank("No documents found.")
    return _show_from(0)
191
+
192
+
193
def show_doc(doc):
    """Compute every display value for a single document.

    Args:
        doc: Document with optional ``text`` and ``metadata`` attributes.

    Returns:
        ``(text, metadata, cleaned_wikitext, infoboxes, iframe_html, header)``
        where ``cleaned_wikitext`` is ``metadata["wikitext"]`` run through
        ``_parse_and_clean_wikicode`` from ``default_wiki_pipeline``.
    """
    metadata = getattr(doc, "metadata", None) or {}
    # Clean the raw wikitext with the default_wiki_pipeline helper.
    cleaned_wikitext = _parse_and_clean_wikicode(
        metadata.get("wikitext"),
        parser=mwparserfromhell,
        language=_extract_language(metadata),
    )
    return (
        getattr(doc, "text", ""),
        metadata,
        cleaned_wikitext,
        metadata.get("infoboxes", []),
        render_iframe(_safe_url_from_metadata(metadata)),
        _build_header_markdown(doc),
    )
203
+
204
+
205
def render_idx(docs, idx: int):
    """Render the document at ``idx``, clamping the index into range.

    Args:
        docs: List of loaded documents.
        idx: Requested position; values outside ``[0, len(docs) - 1]`` are
            clamped to the nearest valid index.

    Returns:
        The 6-tuple produced by ``show_doc``, or a placeholder tuple starting
        with ``"No documents."`` when ``docs`` is empty.
    """
    if not docs:
        return "No documents.", {}, "", [], render_iframe(""), ""

    last = len(docs) - 1
    if idx > last:
        idx = last
    if idx < 0:
        idx = 0
    text, metadata, wiki_text, infoboxes, iframe_html, header = show_doc(docs[idx])
    return text, metadata, wiki_text, infoboxes, iframe_html, header
212
+
213
+
214
def on_prev(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool):
    """Move to the closest earlier document that passes the filters.

    When no earlier cached document passes, the current position is kept
    rather than stepping onto a document the filters exclude.

    Args:
        docs_cache: List of documents loaded so far.
        idx: Currently displayed index.
        reader_iter: Iterator yielding further documents, or ``None``.
        require_has_math: Only show documents flagged as containing math.
        require_has_infobox: Only show documents that have infoboxes.

    Returns:
        A 9-tuple ``(index, docs_cache, reader_iter, left_text, metadata,
        header, wiki_text, infoboxes, iframe_html)``.
    """
    if not docs_cache:
        # Try to ensure at least the first doc is loaded
        docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
    if not docs_cache:
        # Exactly nine values, one per output of the event this handler is
        # wired to; returning fewer makes Gradio reject the callback result.
        return idx, docs_cache, reader_iter, "No documents.", {}, "", "", [], render_iframe("")

    # Walk backwards from the document just before the current one.
    earlier = range(min(idx, len(docs_cache)) - 1, -1, -1)
    target = next(
        (i for i in earlier if matches_filters(docs_cache[i], require_has_math, require_has_infobox)),
        None,
    )
    if target is None:
        # Nothing earlier qualifies: stay where we are (clamped into range).
        target = max(0, min(idx, len(docs_cache) - 1))

    left, left_meta, md, info, right, header = render_idx(docs_cache, target)
    return target, docs_cache, reader_iter, left, left_meta, header, md, info, right
230
+
231
+
232
def on_next(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool):
    """Move to the next document after ``idx`` that passes the filters.

    Documents are pulled from ``reader_iter`` as needed and appended to the
    cache.

    Args:
        docs_cache: List of documents loaded so far.
        idx: Currently displayed index (``-1`` when nothing is shown).
        reader_iter: Iterator yielding further documents, or ``None``.
        require_has_math: Only show documents flagged as containing math.
        require_has_infobox: Only show documents that have infoboxes.

    Returns:
        A 9-tuple ``(index, docs_cache, reader_iter, left_text, metadata,
        header, wiki_text, infoboxes, iframe_html)``. When there is nothing
        further to show, ``index`` stays at ``idx`` and ``left_text`` carries
        the message.
    """

    def _blank(message):
        # Exactly nine values, one per output of the event this handler is
        # wired to; returning fewer makes Gradio reject the callback result.
        return idx, docs_cache, reader_iter, message, {}, "", "", [], render_iframe("")

    target_idx = idx + 1 if idx >= 0 else 0
    docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, target_idx)
    if not docs_cache:
        return _blank("No documents.")

    # Apply filters forward
    new_idx, docs_cache, reader_iter = find_next_matching_from(
        docs_cache, reader_iter, idx, require_has_math, require_has_infobox
    )
    if new_idx == -1:
        return _blank("No documents match filters.")

    left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
    return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
244
+
245
+
246
# Gradio UI: per-session state, controls, the two content columns and the
# event wiring. Nesting is reconstructed from the statement order.
with gr.Blocks() as demo:
    idx_state = gr.State(value=-1)      # index of the displayed document
    docs_state = gr.State(value=[])     # documents loaded so far
    iter_state = gr.State(value=None)   # reader iterator for further documents

    with gr.Row():
        # Full-width controls row for navigation
        with gr.Column():
            with gr.Row():
                language_select = gr.Dropdown(choices=lang_list, value="en", label="Language")
            with gr.Row():
                prev_btn = gr.Button("Previous")
                next_btn = gr.Button("Next")
            header_md = gr.Markdown()
        with gr.Column():
            with gr.Row():
                require_has_math = gr.Checkbox(label="Has math", value=False)
                require_has_infobox = gr.Checkbox(label="Has infobox", value=False)
            with gr.Row():
                id_input = gr.Textbox(label="Wikidata ID/URL/Page ID", placeholder="e.g., Q42 or https://... or 12345", lines=1)
                find_btn = gr.Button("Find")
    with gr.Row():
        show_wiki = gr.Checkbox(label="Show wikimedia/wikipedia extraction", value=False)
        show_preview = gr.Checkbox(label="Show preview", value=True)
        show_infoboxes = gr.Checkbox(label="Show infoboxes", value=True)
    with gr.Row():
        with gr.Column():
            left_text = gr.Textbox(label="FineWiki extractions", lines=30)
            left_meta = gr.JSON(label="Metadata")
        with gr.Column():
            # Starts hidden so the panel agrees with the unchecked
            # "Show wikimedia/wikipedia extraction" box above.
            right_markdown = gr.Textbox(label="wikimedia/wikipedia extraction", lines=30, visible=False)
            right_iframe = gr.HTML(label="Original Page")
            right_infoboxes = gr.JSON(label="Infoboxes")

    # Every handler returns one value per entry here, in this order.
    view_outputs = [
        idx_state,
        docs_state,
        iter_state,
        left_text,
        left_meta,
        header_md,
        right_markdown,
        right_infoboxes,
        right_iframe,
    ]
    filter_inputs = [require_has_math, require_has_infobox]
    nav_inputs = [docs_state, idx_state, iter_state, *filter_inputs]

    language_select.change(on_select_language, inputs=[language_select, *filter_inputs], outputs=view_outputs)
    demo.load(on_select_language, inputs=[language_select, *filter_inputs], outputs=view_outputs)
    find_btn.click(on_find, inputs=[docs_state, idx_state, iter_state, id_input, *filter_inputs], outputs=view_outputs)

    # Visibility toggles driven directly by checkbox changes
    show_wiki.change(lambda v: gr.update(visible=v), inputs=[show_wiki], outputs=[right_markdown])
    show_preview.change(lambda v: gr.update(visible=v), inputs=[show_preview], outputs=[right_iframe])
    show_infoboxes.change(lambda v: gr.update(visible=v), inputs=[show_infoboxes], outputs=[right_infoboxes])
    prev_btn.click(on_prev, inputs=nav_inputs, outputs=view_outputs)
    next_btn.click(on_next, inputs=nav_inputs, outputs=view_outputs)


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7641)
default_wiki_pipeline.py ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # directly copied from https://huggingface.co/datasets/wikimedia/wikipedia/blob/script/wikipedia.py
2
+ """Wikipedia dataset containing cleaned articles of all languages."""
3
+
4
+
5
+ import bz2
6
+ import csv
7
+ import io
8
+ import itertools
9
+ import json
10
+ import re
11
+ import xml.etree.ElementTree as etree
12
+ from urllib.parse import quote
13
+
14
+ import mwparserfromhell
15
+
16
+ import datasets
17
+
18
+
19
+ logger = datasets.logging.get_logger(__name__)
20
+
21
+
22
+ _HOMEPAGE = "https://dumps.wikimedia.org"
23
+
24
+ _CITATION = """\
25
+ @ONLINE {wikidump,
26
+ author = {Wikimedia Foundation},
27
+ title = {Wikimedia Downloads},
28
+ url = {https://dumps.wikimedia.org}
29
+ }
30
+ """
31
+
32
+ _DESCRIPTION = """\
33
+ Wikipedia dataset containing cleaned articles of all languages.
34
+ The datasets are built from the Wikipedia dump
35
+ (https://dumps.wikimedia.org/) with one split per language. Each example
36
+ contains the content of one full Wikipedia article with cleaning to strip
37
+ markdown and unwanted sections (references, etc.).
38
+ """
39
+
40
+ _LICENSE = (
41
+ "This work is licensed under the Creative Commons Attribution-ShareAlike "
42
+ "3.0 Unported License. To view a copy of this license, visit "
43
+ "http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to "
44
+ "Creative Commons, PO Box 1866, Mountain View, CA 94042, USA."
45
+ )
46
+
47
+ # Source: https://meta.wikimedia.org/wiki/List_of_Wikipedias
48
+ # Number: 326 = 339 - 13 (retrieved: 2023-11-17)
49
+ WIKIPEDIA_LANGUAGES = [
50
+ "ab",
51
+ "ace",
52
+ "ady",
53
+ "af",
54
+ "als",
55
+ "alt",
56
+ "am",
57
+ "ami",
58
+ "an",
59
+ "ang",
60
+ "anp",
61
+ "ar",
62
+ "arc",
63
+ "ary",
64
+ "arz",
65
+ "as",
66
+ "ast",
67
+ "atj",
68
+ "av",
69
+ "avk",
70
+ "awa",
71
+ "ay",
72
+ "az",
73
+ "azb",
74
+ "ba",
75
+ "ban",
76
+ "bar",
77
+ "bat-smg",
78
+ "bbc",
79
+ "bcl",
80
+ "be",
81
+ "be-tarask",
82
+ "bg",
83
+ "bh",
84
+ "bi",
85
+ "bjn",
86
+ "blk",
87
+ "bm",
88
+ "bn",
89
+ "bo",
90
+ "bpy",
91
+ "br",
92
+ "bs",
93
+ "bug",
94
+ "bxr",
95
+ "ca",
96
+ "cbk-zam",
97
+ "cdo",
98
+ "ce",
99
+ "ceb",
100
+ "ch",
101
+ "chr",
102
+ "chy",
103
+ "ckb",
104
+ "co",
105
+ "cr",
106
+ "crh",
107
+ "cs",
108
+ "csb",
109
+ "cu",
110
+ "cv",
111
+ "cy",
112
+ "da",
113
+ "dag",
114
+ "de",
115
+ "dga",
116
+ "din",
117
+ "diq",
118
+ "dsb",
119
+ "dty",
120
+ "dv",
121
+ "dz",
122
+ "ee",
123
+ "el",
124
+ "eml",
125
+ "en",
126
+ "eo",
127
+ "es",
128
+ "et",
129
+ "eu",
130
+ "ext",
131
+ "fa",
132
+ "fat",
133
+ "ff",
134
+ "fi",
135
+ "fiu-vro",
136
+ "fj",
137
+ "fo",
138
+ "fon",
139
+ "fr",
140
+ "frp",
141
+ "frr",
142
+ "fur",
143
+ "fy",
144
+ "ga",
145
+ "gag",
146
+ "gan",
147
+ "gcr",
148
+ "gd",
149
+ "gl",
150
+ "glk",
151
+ "gn",
152
+ "gom",
153
+ "gor",
154
+ "got",
155
+ "gpe",
156
+ "gu",
157
+ "guc",
158
+ "gur",
159
+ "guw",
160
+ "gv",
161
+ "ha",
162
+ "hak",
163
+ "haw",
164
+ "he",
165
+ "hi",
166
+ "hif",
167
+ "hr",
168
+ "hsb",
169
+ "ht",
170
+ "hu",
171
+ "hy",
172
+ "hyw",
173
+ "ia",
174
+ "id",
175
+ "ie",
176
+ "ig",
177
+ "ik",
178
+ "ilo",
179
+ "inh",
180
+ "io",
181
+ "is",
182
+ "it",
183
+ "iu",
184
+ "ja",
185
+ "jam",
186
+ "jbo",
187
+ "jv",
188
+ "ka",
189
+ "kaa",
190
+ "kab",
191
+ "kbd",
192
+ "kbp",
193
+ "kcg",
194
+ "kg",
195
+ "ki",
196
+ "kk",
197
+ "kl",
198
+ "km",
199
+ "kn",
200
+ "ko",
201
+ "koi",
202
+ "krc",
203
+ "ks",
204
+ "ksh",
205
+ "ku",
206
+ "kv",
207
+ "kw",
208
+ "ky",
209
+ "la",
210
+ "lad",
211
+ "lb",
212
+ "lbe",
213
+ "lez",
214
+ "lfn",
215
+ "lg",
216
+ "li",
217
+ "lij",
218
+ "lld",
219
+ "lmo",
220
+ "ln",
221
+ "lo",
222
+ "lt",
223
+ "ltg",
224
+ "lv",
225
+ "mad",
226
+ "mai",
227
+ "map-bms",
228
+ "mdf",
229
+ "mg",
230
+ "mhr",
231
+ "mi",
232
+ "min",
233
+ "mk",
234
+ "ml",
235
+ "mn",
236
+ "mni",
237
+ "mnw",
238
+ "mr",
239
+ "mrj",
240
+ "ms",
241
+ "mt",
242
+ "mwl",
243
+ "my",
244
+ "myv",
245
+ "mzn",
246
+ "nah",
247
+ "nap",
248
+ "nds",
249
+ "nds-nl",
250
+ "ne",
251
+ "new",
252
+ "nia",
253
+ "nl",
254
+ "nn",
255
+ "no",
256
+ "nov",
257
+ "nqo",
258
+ "nrm",
259
+ "nso",
260
+ "nv",
261
+ "ny",
262
+ "oc",
263
+ "olo",
264
+ "om",
265
+ "or",
266
+ "os",
267
+ "pa",
268
+ "pag",
269
+ "pam",
270
+ "pap",
271
+ "pcd",
272
+ "pcm",
273
+ "pdc",
274
+ "pfl",
275
+ "pi",
276
+ "pih",
277
+ "pl",
278
+ "pms",
279
+ "pnb",
280
+ "pnt",
281
+ "ps",
282
+ "pt",
283
+ "pwn",
284
+ "qu",
285
+ "rm",
286
+ "rmy",
287
+ "rn",
288
+ "ro",
289
+ "roa-rup",
290
+ "roa-tara",
291
+ "ru",
292
+ "rue",
293
+ "rw",
294
+ "sa",
295
+ "sah",
296
+ "sat",
297
+ "sc",
298
+ "scn",
299
+ "sco",
300
+ "sd",
301
+ "se",
302
+ "sg",
303
+ "sh",
304
+ "shi",
305
+ "shn",
306
+ "si",
307
+ "simple",
308
+ "sk",
309
+ "skr",
310
+ "sl",
311
+ "sm",
312
+ "smn",
313
+ "sn",
314
+ "so",
315
+ "sq",
316
+ "sr",
317
+ "srn",
318
+ "ss",
319
+ "st",
320
+ "stq",
321
+ "su",
322
+ "sv",
323
+ "sw",
324
+ "szl",
325
+ "szy",
326
+ "ta",
327
+ "tay",
328
+ "tcy",
329
+ "te",
330
+ "tet",
331
+ "tg",
332
+ "th",
333
+ "ti",
334
+ "tk",
335
+ "tl",
336
+ "tly",
337
+ "tn",
338
+ "to",
339
+ "tpi",
340
+ "tr",
341
+ "trv",
342
+ "ts",
343
+ "tt",
344
+ "tum",
345
+ "tw",
346
+ "ty",
347
+ "tyv",
348
+ "udm",
349
+ "ug",
350
+ "uk",
351
+ "ur",
352
+ "uz",
353
+ "ve",
354
+ "vec",
355
+ "vep",
356
+ "vi",
357
+ "vls",
358
+ "vo",
359
+ "wa",
360
+ "war",
361
+ "wo",
362
+ "wuu",
363
+ "xal",
364
+ "xh",
365
+ "xmf",
366
+ "yi",
367
+ "yo",
368
+ "za",
369
+ "zea",
370
+ "zgh",
371
+ "zh",
372
+ "zh-classical",
373
+ "zh-min-nan",
374
+ "zh-yue",
375
+ "zu",
376
+ ]
377
+
378
+ # Source: for each Wikipedia language code (example shown for "ab"), aliases for namespaces -2 and 6 accessed via this API call:
379
+ # https://ab.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespacealiases|namespaces&format=json&formatversion=2
380
+ # Retrieved: 2023-11-17
381
+ MEDIA_ALIASES = {
382
+ "ab": ["Амедиа", "Афаил", "Изображение", "Медиа", "Файл"],
383
+ "ace": ["Alat", "Berkas", "Beureukaih", "Gambar"],
384
+ "ady": ["Медиа"],
385
+ "af": ["Beeld", "Lêer"],
386
+ "als": ["Bild", "Datei", "Medium"],
387
+ "alt": ["Изображение", "Медиа", "Файл"],
388
+ "am": ["ስዕል", "ፋይል"],
389
+ "ami": ["Faylo", "Mitiya", "图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
390
+ "an": ["Imachen", "Imagen"],
391
+ "ang": ["Biliþ", "Ymele"],
392
+ "anp": ["फाईल", "मीडिया"],
393
+ "ar": ["صورة", "ملف", "ميديا", "وسائط"],
394
+ "arc": ["ܠܦܦܐ", "ܡܝܕܝܐ"],
395
+ "ary": ["صورة", "فيشي", "ملف", "ميديا", "وسائط"],
396
+ "arz": ["صورة", "ملف", "ميديا", "وسائط"],
397
+ "as": ["चित्र", "চিত্র", "চিত্ৰ", "মাধ্যম"],
398
+ "ast": ["Archivu", "Ficheru", "Imagen", "Imaxe", "Imaxen", "Medios"],
399
+ "atj": ["Natisinahikaniwoc", "Tipatcimoctakewin"],
400
+ "av": ["Изображение", "Медиа", "Файл"],
401
+ "avk": ["Ewava", "Imagen", "Iyeltak", "Kanaca", "Mamind", "Изображение"],
402
+ "awa": ["फाइल", "मीडिया"],
403
+ "ay": ["Archivo", "Imagen", "Medio"],
404
+ "az": ["Fayl", "Mediya", "Şəkil"],
405
+ "azb": ["تصویر", "رسانه", "رسانه‌ای", "فایل", "مدیا"],
406
+ "ba": ["Изображение", "Медиа", "Рәсем", "Файл"],
407
+ "ban": ["Berkas", "Gambar", "Média"],
408
+ "bar": ["Bild", "Datei", "Medium"],
409
+ "bat-smg": ["Abruozdielis", "Medėjė", "Vaizdas"],
410
+ "bbc": ["Ugasan"],
411
+ "bcl": ["Ladawan", "Medio"],
412
+ "be": ["Выява", "Мультымедыя", "Файл"],
413
+ "be-tarask": ["Выява", "Мэдыя", "Файл"],
414
+ "bg": ["Картинка", "Медия", "Файл"],
415
+ "bh": ["चित्र", "मीडिया"],
416
+ "bjn": ["Barakas", "Berkas", "Gambar"],
417
+ "blk": ["ဖုဲင်", "မီဒီယာ"],
418
+ "bm": ["Fichier", "Média"],
419
+ "bn": ["চিত্র", "মিডিয়া"],
420
+ "bpy": ["ছবি", "মিডিয়া"],
421
+ "br": ["Restr", "Skeudenn"],
422
+ "bs": ["Datoteka", "Medija", "Mediji", "Slika"],
423
+ "bug": ["Berkas", "Gambar"],
424
+ "bxr": ["Изображение", "Меди", "Файл"],
425
+ "ca": ["Fitxer", "Imatge"],
426
+ "cbk-zam": ["Archivo", "Imagen", "Medio"],
427
+ "cdo": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
428
+ "ce": ["Изображение", "Медиа", "Медйа", "Сурт", "Файл", "Хlум"],
429
+ "ceb": ["Imahen", "Medya", "Payl"],
430
+ "ch": ["Litratu"],
431
+ "ckb": ["میدیا", "پەڕگە"],
432
+ "co": ["Immagine"],
433
+ "crh": ["Fayl", "Resim", "Медиа", "Ресим", "Файл"],
434
+ "cs": ["Média", "Obrázok", "Soubor"],
435
+ "csb": ["Grafika", "Òbrôzk"],
436
+ "cu": ["Ви́дъ", "Видъ", "Дѣло", "Срѣдьства"],
437
+ "cv": ["Изображение", "Медиа", "Ӳкерчĕк"],
438
+ "cy": ["Delwedd"],
439
+ "da": ["Billede", "Fil"],
440
+ "dag": ["Lahabali kɔligu", "Miidiya"],
441
+ "de": ["Bild", "Datei", "Medium"],
442
+ "dga": ["Duoro bimbu zie", "Duoro kɔre"],
443
+ "din": ["Apamduööt", "Ciɛl"],
444
+ "diq": ["Dosya", "Medya"],
445
+ "dsb": ["Bild", "Dataja", "Medija", "Wobraz"],
446
+ "dty": ["चित्र", "मिडिया"],
447
+ "dv": ["މީޑިއާ", "ފައިލު", "ފައިލް"],
448
+ "el": ["Αρχείο", "Εικόνα", "Μέσο", "Μέσον"],
449
+ "eml": ["Immagine"],
450
+ "eo": ["Aŭdvidaĵo", "Dosiero"],
451
+ "es": ["Archivo", "Imagen", "Medio"],
452
+ "et": ["Fail", "Meedia", "Pilt"],
453
+ "eu": ["Fitxategi", "Irudi"],
454
+ "ext": ["Archivu", "Imagen", "Mediu"],
455
+ "fa": ["تصویر", "رسانه", "رسانه‌ای", "مدیا", "پرونده"],
456
+ "fat": ["Fael"],
457
+ "ff": ["Fichier", "Média"],
458
+ "fi": ["Kuva", "Tiedosto"],
459
+ "fiu-vro": ["Meediä", "Pilt"],
460
+ "fo": ["Miðil", "Mynd"],
461
+ "fon": ["Wékpo", "Yɛwliɖonuji"],
462
+ "fr": ["Fichier", "Média"],
463
+ "frp": ["Fichiér", "Mèdia", "Émâge"],
464
+ "frr": ["Bild", "Datei", "Medium"],
465
+ "fur": ["Figure", "Immagine"],
466
+ "fy": ["Ofbyld"],
467
+ "ga": ["Meán", "Íomhá"],
468
+ "gag": ["Dosya", "Dosye", "Mediya", "Medya", "Resim"],
469
+ "gan": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "文檔", "档案", "檔案"],
470
+ "gcr": ["Fiché", "Médja"],
471
+ "gd": ["Faidhle", "Meadhan"],
472
+ "gl": ["Arquivo", "Ficheiro", "Imagem", "Imaxe"],
473
+ "glk": ["تصویر", "رسانه", "رسانه‌ای", "فاىل", "مديا", "پرونده"],
474
+ "gn": ["Imagen", "Medio", "Ta'ãnga"],
475
+ "gom": ["फायल", "माध्यम", "मिडिया"],
476
+ "gor": ["Berkas", "Gambar"],
477
+ "got": ["𐍆𐌴𐌹𐌻𐌰"],
478
+ "gu": ["ચિત્ર", "દ્રશ્ય-શ્રાવ્ય (મિડિયા)"],
479
+ "guc": ["Anaajaalaa", "Ayaakuwapülee", "Imagen"],
480
+ "gur": ["Faali", "Miidiya"],
481
+ "guw": ["Wepo"],
482
+ "gv": ["Coadan", "Meanyn"],
483
+ "hak": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
484
+ "haw": ["Kiʻi", "Pāpaho", "Waihona"],
485
+ "he": ["מדיה", "קו", "קובץ", "תמונה"],
486
+ "hi": ["चित्र", "मीडिया"],
487
+ "hif": ["file", "saadhan"],
488
+ "hr": ["DT", "Datoteka", "Mediji", "Slika"],
489
+ "hsb": ["Bild", "Dataja", "Wobraz"],
490
+ "ht": ["Fichye", "Imaj", "Medya"],
491
+ "hu": ["Fájl", "Kép", "Média"],
492
+ "hy": ["Մեդիա", "Պատկեր"],
493
+ "hyw": ["Մեդիա", "Պատկեր"],
494
+ "ia": ["Imagine", "Multimedia"],
495
+ "id": ["Berkas", "Gambar"],
496
+ "ig": ["Midia", "Nká", "Usòrò", "Ákwúkwó orünotu"],
497
+ "ilo": ["Midia", "Papeles"],
498
+ "inh": ["Изображение", "Медиа", "Файл"],
499
+ "io": ["Arkivo", "Imajo"],
500
+ "is": ["Miðill", "Mynd"],
501
+ "it": ["Immagine"],
502
+ "ja": ["ファイル", "メディア", "画像"],
503
+ "jbo": ["datnyvei", "velsku"],
504
+ "jv": ["Barkas", "Gambar", "Medhia", "Médhia"],
505
+ "ka": ["მედია", "სურათი", "ფაილი"],
506
+ "kaa": ["Fayl", "Su'wret", "Swret", "Taspa", "Сурет", "Таспа", "تاسپا", "سۋرەت"],
507
+ "kab": ["Tugna"],
508
+ "kbd": ["Медиа", "Файл"],
509
+ "kbp": ["Fichier", "Média"],
510
+ "kcg": ["Fail"],
511
+ "kg": ["Fisye"],
512
+ "kk": ["Swret", "Taspa", "Сурет", "Таспа", "تاسپا", "سۋرەت"],
513
+ "kl": ["Billede", "Fiileq", "Fil"],
514
+ "km": ["មីឌា", "មេឌា", "រូបភាព", "ឯកសារ"],
515
+ "kn": ["ಚಿತ್ರ", "ಮೀಡಿಯ"],
516
+ "ko": ["그림", "미디어", "파일"],
517
+ "koi": ["Изображение", "Медиа", "Файл"],
518
+ "krc": ["Изображение", "Медиа", "Файл"],
519
+ "ks": ["فَیِل", "میڈیا"],
520
+ "ksh": ["Beld", "Belld", "Bild", "Datei", "Medie", "Medium", "Meedije", "Meedijum"],
521
+ "ku": ["Medya", "Wêne", "میدیا", "پەڕگە"],
522
+ "kv": ["Изображение", "Медиа", "Файл"],
523
+ "kw": ["Restren"],
524
+ "ky": ["Медиа", "Файл"],
525
+ "la": ["Fasciculus", "Imago"],
526
+ "lad": ["Archivo", "Dossia", "Dosya", "Imagen", "Meddia", "Medya"],
527
+ "lb": ["Bild", "Fichier"],
528
+ "lbe": ["Изображение", "Медиа", "Сурат"],
529
+ "lez": ["Mediya", "Şəkil", "Изображение", "Медиа", "Файл"],
530
+ "lfn": ["Fix"],
531
+ "li": ["Aafbeilding", "Afbeelding", "Plaetje"],
532
+ "lij": ["Immaggine", "Immagine"],
533
+ "lld": ["Immagine"],
534
+ "lmo": ["Archivi", "Immagine", "Imàjine"],
535
+ "ln": ["Fichier", "Média"],
536
+ "lo": ["ສື່", "ສື່ອ", "ຮູບ"],
537
+ "lt": ["Medija", "Vaizdas"],
538
+ "ltg": ["Fails", "Medeja"],
539
+ "lv": ["Attēls"],
540
+ "mad": ["Bhengkek", "Gambar", "Mèḍia"],
541
+ "mai": ["फाइल", "मेडिया"],
542
+ "map-bms": ["Barkas", "Gambar", "Medhia", "Médhia"],
543
+ "mdf": ["Изображение", "Медиа", "Няйф"],
544
+ "mg": ["Média", "Rakitra", "Sary"],
545
+ "mhr": ["Изображение", "Медиа", "Файл"],
546
+ "min": ["Berkas", "Gambar"],
547
+ "mk": ["Медиум", "Медија", "Податотека", "Слика"],
548
+ "ml": ["ചി", "ചിത്രം", "പ്ര", "പ്രമാണം", "മീഡിയ"],
549
+ "mn": ["Зураг", "Медиа", "Файл"],
550
+ "mni": ["ꯃꯦꯗꯤꯌꯥ", "ꯐꯥꯏꯜ"],
551
+ "mnw": ["မဳဒဳယာ", "ဝှာင်"],
552
+ "mr": ["चित्र", "मिडिया"],
553
+ "mrj": ["Изображение", "Медиа", "Файл"],
554
+ "ms": ["Fail", "Imej"],
555
+ "mt": ["Medja", "Midja", "Stampa"],
556
+ "mwl": ["Arquivo", "Fexeiro", "Ficheiro", "Imagem", "Multimédia"],
557
+ "my": ["ဖိုင်", "မီဒီယာ"],
558
+ "myv": ["Артовкс", "Изображение", "Медия"],
559
+ "mzn": ["تصویر", "رسانه", "رسانه‌ای", "مدیا", "مه‌دیا", "پرونده"],
560
+ "nah": ["Imagen", "Mēdiatl", "Īxiptli"],
561
+ "nap": ["Fiùra", "Immagine"],
562
+ "nds": ["Bild", "Datei"],
563
+ "nds-nl": ["Afbeelding", "Bestaand", "Ofbeelding"],
564
+ "ne": ["चित्र", "मीडिया"],
565
+ "new": ["किपा", "माध्यम"],
566
+ "nia": ["Berkas", "Gambar"],
567
+ "nl": ["Afbeelding", "Bestand"],
568
+ "nn": ["Bilde", "Fil", "Filpeikar"],
569
+ "no": ["Bilde", "Fil", "Medium"],
570
+ "nqo": ["ߞߐߕߐ߮", "ߟߊߛߋߢߊߥߙߍ"],
571
+ "nrm": ["Fichier", "Média"],
572
+ "nso": ["Seswantšho"],
573
+ "nv": ["Eʼelyaaígíí"],
574
+ "oc": ["Fichièr", "Imatge", "Mèdia"],
575
+ "olo": ["Failu", "Kuva", "Medii"],
576
+ "or": ["ଫାଇଲ", "ମାଧ୍ୟମ"],
577
+ "os": ["Изображение", "Медиа", "Ныв", "Файл"],
578
+ "pa": ["ਤਸਵੀਰ", "ਮੀਡੀਆ"],
579
+ "pcd": ["Fichier", "Média"],
580
+ "pdc": ["Bild", "Datei", "Feil", "Medium"],
581
+ "pfl": ["Bild", "Dadai", "Datei", "Medium"],
582
+ "pi": ["पटिमा", "मीडिया"],
583
+ "pl": ["Grafika", "Plik"],
584
+ "pms": ["Figura", "Immagine"],
585
+ "pnb": ["تصویر", "فائل", "میڈیا"],
586
+ "pnt": ["Αρχείον", "Εικόνα", "Εικόναν", "Μέσον"],
587
+ "ps": ["انځور", "دوتنه", "رسنۍ"],
588
+ "pt": ["Arquivo", "Ficheiro", "Imagem", "Multimédia"],
589
+ "pwn": [
590
+ "mitiya",
591
+ "sineqetj a vecik",
592
+ "图像",
593
+ "图片",
594
+ "圖像",
595
+ "圖片",
596
+ "媒体",
597
+ "媒体文件",
598
+ "媒体档案",
599
+ "媒體",
600
+ "媒體文件",
601
+ "媒體檔案",
602
+ "文件",
603
+ "档案",
604
+ "檔案",
605
+ ],
606
+ "qu": ["Imagen", "Midya", "Rikcha"],
607
+ "rm": ["Bild", "Datoteca", "Multimedia"],
608
+ "rmy": ["Chitro", "Fişier", "Imagine", "Mediya"],
609
+ "rn": ["Dosiye"],
610
+ "ro": ["Fişier", "Fișier", "Imagine"],
611
+ "roa-rup": ["Fişier", "Fișier", "Imagine"],
612
+ "roa-tara": ["Immagine"],
613
+ "ru": ["Изображение", "Медиа", "Файл"],
614
+ "rue": ["Зображення", "Изображение", "Медиа", "Медіа", "Файл"],
615
+ "rw": ["Dosiye", "Itangazamakuru"],
616
+ "sa": ["चित्रं", "चित्रम्", "माध्यम", "माध्यमम्", "सञ्चिका"],
617
+ "sah": ["Билэ", "Изображение", "Миэдьийэ", "Ойуу"],
618
+ "sat": ["ᱢᱤᱰᱤᱭᱟ", "ᱨᱮᱫ"],
619
+ "sc": ["Immàgini"],
620
+ "scn": ["Immagine", "Mmàggini", "Mèdia"],
621
+ "sd": ["ذريعات", "عڪس", "فائل"],
622
+ "se": ["Bilde", "Fiila", "Kuva"],
623
+ "sg": ["Fichier", "Média"],
624
+ "sh": ["DT", "Datoteka", "Fotografija", "Medija", "Mediji", "Slika", "ДТ", "Датотека", "Медиј"],
625
+ "shi": ["Afaylu", "Midya"],
626
+ "shn": ["သိုဝ်ႇၶၢဝ်ႇ", "ၾၢႆႇ"],
627
+ "si": ["ගොනුව", "මාධ්‍යය", "රූපය"],
628
+ "sk": ["Médiá", "Obrázok", "Súbor"],
629
+ "skr": ["فائل", "میڈیا"],
630
+ "sl": ["Datoteka", "Slika"],
631
+ "smn": ["Kuva", "Tiätuvuárkká"],
632
+ "sq": ["Figura", "Skeda"],
633
+ "sr": [
634
+ "Datoteka",
635
+ "Fotografija",
636
+ "Medij",
637
+ "Medija",
638
+ "Slika",
639
+ "Датотека",
640
+ "Медиј",
641
+ "Медија",
642
+ "Слика",
643
+ "Фотографија",
644
+ ],
645
+ "srn": ["Afbeelding", "Gefre"],
646
+ "stq": ["Bielde", "Bild"],
647
+ "su": ["Gambar", "Média"],
648
+ "sv": ["Bild", "Fil"],
649
+ "sw": ["Faili", "Picha"],
650
+ "szl": ["Grafika", "Plik"],
651
+ "szy": ["myiti", "tangan", "图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
652
+ "ta": ["ஊடகம்", "படிமம்"],
653
+ "tay": [
654
+ "biru' na zayzyuwaw",
655
+ "biru’ na zayzyuwaw",
656
+ "media",
657
+ "图像",
658
+ "图片",
659
+ "圖像",
660
+ "圖片",
661
+ "媒体",
662
+ "媒体文件",
663
+ "媒体档案",
664
+ "媒體",
665
+ "媒體文件",
666
+ "媒體檔案",
667
+ "文件",
668
+ "档案",
669
+ "檔案",
670
+ ],
671
+ "tcy": ["ಫೈಲ್", "ಮಾದ್ಯಮೊ"],
672
+ "te": ["దస్త్రం", "ఫైలు", "బొమ్మ", "మీడియా"],
673
+ "tet": ["Arquivo", "Imagem", "Imajen"],
674
+ "tg": ["Акс", "Медиа"],
675
+ "th": ["ภาพ", "สื่อ", "ไฟล์"],
676
+ "ti": ["ሜድያ", "ፋይል"],
677
+ "tk": ["Faýl"],
678
+ "tl": ["Midya", "Talaksan"],
679
+ "tly": ["Fajl", "Medja"],
680
+ "tn": ["Pego", "Setshwantsho"],
681
+ "tpi": ["Fail"],
682
+ "tr": ["Dosya", "Medya", "Ortam", "Resim"],
683
+ "trv": [
684
+ "Meyti",
685
+ "Patas bntasan",
686
+ "图像",
687
+ "图片",
688
+ "圖像",
689
+ "圖片",
690
+ "媒体",
691
+ "媒体文件",
692
+ "媒体档案",
693
+ "媒體",
694
+ "媒體文件",
695
+ "媒體檔案",
696
+ "文件",
697
+ "档案",
698
+ "檔案",
699
+ ],
700
+ "tt": ["Räsem", "Изображение", "Медиа", "Рәсем", "Файл"],
701
+ "ty": ["Fichier", "Média"],
702
+ "tyv": ["Изображение", "Медиа", "Файл"],
703
+ "udm": ["Изображение", "Медиа", "Суред", "Файл"],
704
+ "ug": ["ھۆججەت", "ۋاسىتە"],
705
+ "uk": ["Зображення", "Изображение", "Медиа", "Медіа", "Файл"],
706
+ "ur": ["تصویر", "زریعہ", "فائل", "ملف", "میڈیا", "وسیط"],
707
+ "uz": ["Fayl", "Mediya", "Tasvir"],
708
+ "vec": ["Immagine", "Imàjine", "Mèdia"],
709
+ "vep": ["Fail", "Pilt"],
710
+ "vi": ["Hình", "Phương tiện", "Tập tin"],
711
+ "vls": ["Afbeelding", "Ofbeeldienge"],
712
+ "vo": ["Magod", "Nünamakanäd", "Ragiv"],
713
+ "wa": ["Imådje"],
714
+ "war": ["Fayl", "Medya", "Paypay"],
715
+ "wo": ["Dencukaay", "Xibaarukaay"],
716
+ "wuu": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
717
+ "xal": ["Аһар", "Боомг", "Зург", "Изображение"],
718
+ "xmf": ["მედია", "სურათი", "ფაილი"],
719
+ "yi": ["בילד", "טעקע", "מעדיע", "תמונה"],
720
+ "yo": ["Amóhùnmáwòrán", "Fáìlì", "Àwòrán"],
721
+ "za": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
722
+ "zea": ["Afbeelding", "Plaetje"],
723
+ "zgh": ["ⴰⴼⴰⵢⵍⵓ", "ⵎⵉⴷⵢⴰ"],
724
+ "zh": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
725
+ "zh-classical": ["图像", "图片", "圖像", "圖片", "媒体", "媒体文件", "媒体档案", "媒體", "媒體文件", "媒體檔案", "文件", "档案", "檔案"],
726
+ "zh-min-nan": [
727
+ "Mûi-thé",
728
+ "tóng-àn",
729
+ "图像",
730
+ "图片",
731
+ "圖像",
732
+ "圖片",
733
+ "媒体",
734
+ "媒体文件",
735
+ "媒体档案",
736
+ "媒體",
737
+ "媒體文件",
738
+ "媒體檔案",
739
+ "文件",
740
+ "档案",
741
+ "檔案",
742
+ ],
743
+ "zh-yue": ["图", "图像", "圖", "圖像", "媒体", "媒體", "文件", "档", "档案", "檔", "檔案"],
744
+ }
745
+
746
+ # Source: for each Wikipedia language code (example shown for "ab"), aliases for namespace 14 accessed via this API call:
747
+ # https://ab.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespacealiases|namespaces&format=json&formatversion=2
748
+ # Retrieved: 2023-11-17
749
+ CAT_ALIASES = {
750
+ "ab": ["Акатегориа", "Категория"],
751
+ "ace": ["Kategori", "Kawan"],
752
+ "af": ["Kategorie"],
753
+ "als": ["Kategorie"],
754
+ "alt": ["Категория"],
755
+ "am": ["መደብ"],
756
+ "ami": ["Kasasiwasiw", "分类", "分類"],
757
+ "an": ["Categoría"],
758
+ "ang": ["Flocc"],
759
+ "anp": ["श्रेणी"],
760
+ "ar": ["تصنيف"],
761
+ "arc": ["ܣܕܪܐ"],
762
+ "ary": ["تصنيف"],
763
+ "arz": ["تصنيف"],
764
+ "as": ["CAT", "श्रेणी", "শ্রেণী", "শ্ৰেণী"],
765
+ "ast": ["Categoría"],
766
+ "atj": ["Tipanictawin"],
767
+ "av": ["Категория"],
768
+ "avk": ["Loma"],
769
+ "awa": ["श्रेणी"],
770
+ "ay": ["Categoría"],
771
+ "az": ["Kateqoriya"],
772
+ "azb": ["بؤلمه"],
773
+ "ba": ["Категория", "Төркөм"],
774
+ "ban": ["Kategori"],
775
+ "bar": ["Kategorie"],
776
+ "bat-smg": ["Kategorija", "Kateguorėjė"],
777
+ "bbc": ["Horong"],
778
+ "bcl": ["Kategorya"],
779
+ "be": ["Катэгорыя"],
780
+ "be-tarask": ["Катэгорыя"],
781
+ "bg": ["Категория"],
782
+ "bh": ["श्रेणी"],
783
+ "bjn": ["Kategori", "Tumbung"],
784
+ "blk": ["ကဏ္ဍ"],
785
+ "bm": ["Catégorie"],
786
+ "bn": ["বিষয়শ্রেণী"],
787
+ "bpy": ["থাক"],
788
+ "br": ["Rummad"],
789
+ "bs": ["Kategorija"],
790
+ "bug": ["Kategori"],
791
+ "bxr": ["Категори", "Категория"],
792
+ "ca": ["Categoria"],
793
+ "cbk-zam": ["Categoría"],
794
+ "cdo": ["分类", "分類"],
795
+ "ce": ["Кадегар", "Категори", "Тоба"],
796
+ "ceb": ["Kategoriya"],
797
+ "ch": ["Katigoria"],
798
+ "ckb": ["پ", "پۆل"],
799
+ "co": ["Categoria"],
800
+ "crh": ["Kategoriya", "Категория"],
801
+ "cs": ["Kategorie"],
802
+ "csb": ["Kategòrëjô"],
803
+ "cu": ["Категория", "Катигорїꙗ", "Катигорї"],
804
+ "cv": ["Категори"],
805
+ "cy": ["Categori"],
806
+ "da": ["Kategori"],
807
+ "dag": ["Pubu"],
808
+ "de": ["Kategorie"],
809
+ "dga": ["Gbuli"],
810
+ "din": ["Bekätakthook"],
811
+ "diq": ["Kategori", "Kategoriye"],
812
+ "dsb": ["Kategorija"],
813
+ "dty": ["श्रेणी"],
814
+ "dv": ["ޤިސްމު"],
815
+ "el": ["Κατηγορία"],
816
+ "eml": ["Categoria"],
817
+ "eo": ["Kategorio"],
818
+ "es": ["CAT", "Categoría"],
819
+ "et": ["Kategooria"],
820
+ "eu": ["Kategoria"],
821
+ "ext": ["Categoria", "Categoría"],
822
+ "fa": ["رده"],
823
+ "fat": ["Nkyekyεmu"],
824
+ "ff": ["Catégorie"],
825
+ "fi": ["Luokka"],
826
+ "fiu-vro": ["Katõgooria"],
827
+ "fo": ["Bólkur"],
828
+ "fon": ["Akpaxwé"],
829
+ "fr": ["Catégorie"],
830
+ "frp": ["Catègorie"],
831
+ "frr": ["Kategorie"],
832
+ "fur": ["Categorie"],
833
+ "fy": ["Kategory"],
834
+ "ga": ["Catagóir", "Rang"],
835
+ "gag": ["Kategori", "Kategoriya"],
836
+ "gan": ["分类", "分類"],
837
+ "gcr": ["Katégori"],
838
+ "gd": ["Roinn-seòrsa"],
839
+ "gl": ["Categoría"],
840
+ "glk": ["جرگه", "رده"],
841
+ "gn": ["Ñemohenda"],
842
+ "gom": ["वर्ग", "श्रेणी"],
843
+ "gor": ["Dalala"],
844
+ "got": ["𐌷𐌰𐌽𐍃𐌰"],
845
+ "gu": ["CAT", "શ્રે", "શ્રેણી"],
846
+ "guc": ["Akotchajülee sünülia"],
847
+ "gur": ["Buuri buuri"],
848
+ "guw": ["Adà"],
849
+ "gv": ["Ronney"],
850
+ "hak": ["分类", "分類"],
851
+ "haw": ["Māhele"],
852
+ "he": ["קט", "קטגוריה"],
853
+ "hi": ["श्र", "श्रेणी"],
854
+ "hif": ["vibhag"],
855
+ "hr": ["CT", "KT", "Kategorija"],
856
+ "hsb": ["Kategorija"],
857
+ "ht": ["Kategori"],
858
+ "hu": ["Kategória"],
859
+ "hy": ["Կատեգորիա"],
860
+ "hyw": ["Ստորոգութիւն"],
861
+ "ia": ["Categoria"],
862
+ "id": ["Kategori"],
863
+ "ie": ["Categorie"],
864
+ "ig": ["Ébéonọr", "Òtù"],
865
+ "ilo": ["Kategoria"],
866
+ "inh": ["ОагӀат"],
867
+ "io": ["Kategorio"],
868
+ "is": ["Flokkur"],
869
+ "it": ["CAT", "Categoria"],
870
+ "ja": ["カテゴリ"],
871
+ "jbo": ["klesi"],
872
+ "jv": ["Kategori"],
873
+ "ka": ["კატეგორია"],
874
+ "kaa": ["Kategoriya", "Sanat", "Санат", "سانات"],
875
+ "kab": ["Taggayt"],
876
+ "kbd": ["Категориэ", "Категория"],
877
+ "kbp": ["Catégorie"],
878
+ "kcg": ["Sa"],
879
+ "kg": ["Kalasi"],
880
+ "kk": ["Sanat", "Санат", "سانات"],
881
+ "kl": ["Kategori", "Sumut atassuseq"],
882
+ "km": ["ចំណាត់ក្រុម", "ចំណាត់ថ្នាក់ក្រុម", "ចំនាត់ថ្នាក់ក្រុម"],
883
+ "kn": ["ವರ್ಗ"],
884
+ "ko": ["분류"],
885
+ "koi": ["Категория"],
886
+ "krc": ["Категория"],
887
+ "ks": ["زٲژ"],
888
+ "ksh": ["Kategorie", "Katejori", "Kattejori", "Saachjrop", "Saachjropp", "Saachjrupp", "Sachjrop"],
889
+ "ku": ["Kategorî", "پۆل"],
890
+ "kv": ["Категория"],
891
+ "kw": ["Class", "Klass"],
892
+ "ky": ["Категория"],
893
+ "la": ["Categoria"],
894
+ "lad": ["Categoría", "Kateggoría", "Katēggoría"],
895
+ "lb": ["Kategorie"],
896
+ "lbe": ["Категория"],
897
+ "lez": ["Категория"],
898
+ "lfn": ["Categoria"],
899
+ "li": ["Categorie", "Kategorie"],
900
+ "lij": ["Categoria", "Categorîa"],
901
+ "lld": ["Categoria"],
902
+ "lmo": ["Categoria", "Categuria"],
903
+ "ln": ["Catégorie"],
904
+ "lo": ["ໝວດ"],
905
+ "lt": ["Kategorija"],
906
+ "ltg": ["Kategoreja"],
907
+ "lv": ["Kategorija"],
908
+ "mad": ["Bhângsa"],
909
+ "mai": ["CA", "श्रेणी"],
910
+ "map-bms": ["Kategori"],
911
+ "mdf": ["Категорие", "Категория"],
912
+ "mg": ["Catégorie", "Sokajy"],
913
+ "mhr": ["Категорий", "Категория"],
914
+ "min": ["Kategori"],
915
+ "mk": ["Категорија"],
916
+ "ml": ["വ", "വി", "വിഭാഗം", "വർഗ്ഗം"],
917
+ "mn": ["Ангилал"],
918
+ "mni": ["ꯃꯆꯥꯈꯥꯏꯕ"],
919
+ "mnw": ["ကဏ္ဍ"],
920
+ "mr": ["वर्ग"],
921
+ "mrj": ["Категори", "Категория"],
922
+ "ms": ["Kategori"],
923
+ "mt": ["Kategorija"],
924
+ "mwl": ["Catadorie", "Categoria"],
925
+ "my": ["ကဏ္ဍ"],
926
+ "myv": ["Категория"],
927
+ "mzn": ["رج", "رده"],
928
+ "nah": ["Categoría", "Neneuhcāyōtl"],
929
+ "nap": ["Categoria", "Categurìa"],
930
+ "nds": ["Kategorie"],
931
+ "nds-nl": ["Categorie", "Kategorie", "Kattegerie"],
932
+ "ne": ["श्रेणी"],
933
+ "new": ["पुचः"],
934
+ "nia": ["Kategori"],
935
+ "nl": ["Categorie"],
936
+ "nn": ["Kategori"],
937
+ "no": ["Kategori"],
938
+ "nqo": ["ߦߌߟߡߊ"],
939
+ "nrm": ["Catégorie"],
940
+ "nso": ["Setensele"],
941
+ "nv": ["Tʼááłáhági átʼéego"],
942
+ "oc": ["Categoria"],
943
+ "olo": ["Kategourii"],
944
+ "or": ["ବିଭାଗ", "ଶ୍ରେଣୀ"],
945
+ "os": ["Категори"],
946
+ "pa": ["ਸ਼੍ਰੇਣੀ"],
947
+ "pcd": ["Catégorie"],
948
+ "pcm": ["Katigori"],
949
+ "pdc": ["Abdeeling", "Kategorie"],
950
+ "pfl": ["Kadegorie", "Kategorie", "Sachgrubb"],
951
+ "pi": ["विभाग"],
952
+ "pl": ["Kategoria"],
953
+ "pms": ["Categorìa"],
954
+ "pnb": ["گٹھ"],
955
+ "pnt": ["Κατηγορίαν"],
956
+ "ps": ["وېشنيزه"],
957
+ "pt": ["Categoria"],
958
+ "pwn": ["pinapapilipiliqan", "分类", "分類"],
959
+ "qu": ["Katiguriya"],
960
+ "rm": ["Categoria"],
961
+ "rmy": ["Shopni"],
962
+ "rn": ["Umuce"],
963
+ "ro": ["Categorie"],
964
+ "roa-rup": ["Categorie"],
965
+ "roa-tara": ["Categoria"],
966
+ "ru": ["К", "Категория"],
967
+ "rue": ["Категория", "Катеґорія"],
968
+ "rw": ["Ikiciro"],
969
+ "sa": ["वर्गः"],
970
+ "sah": ["Категория"],
971
+ "sat": ["ᱛᱷᱚᱠ"],
972
+ "sc": ["Categoria"],
973
+ "scn": ["Catigurìa"],
974
+ "sd": ["زمرو"],
975
+ "se": ["Kategoriija"],
976
+ "sg": ["Catégorie"],
977
+ "sh": ["KAT", "KT", "Kategorija", "КАТ", "КТ", "Категорија"],
978
+ "shi": ["Taggayt"],
979
+ "shn": ["ပိူင်ထၢၼ်ႈ"],
980
+ "si": ["ප්‍රවර්ගය"],
981
+ "sk": ["Kategória"],
982
+ "skr": ["ونکی"],
983
+ "sl": ["Kategorija"],
984
+ "smn": ["Luokka"],
985
+ "sq": ["Kategori", "Kategoria"],
986
+ "sr": ["Kategorija", "Категорија"],
987
+ "srn": ["Categorie", "Guru"],
988
+ "stq": ["Kategorie"],
989
+ "su": ["Kategori"],
990
+ "sv": ["Kategori"],
991
+ "sw": ["Jamii"],
992
+ "szl": ["Kategoria", "Kategoryjo"],
993
+ "szy": ["kakuniza", "分类", "分類"],
994
+ "ta": ["பகுப்பு"],
995
+ "tay": ["zyuwaw na", "分类", "分類"],
996
+ "tcy": ["ವರ್ಗೊ"],
997
+ "te": ["వర్గం"],
998
+ "tet": ["Kategoria", "Kategoría"],
999
+ "tg": ["Гурӯҳ"],
1000
+ "th": ["หมวดหมู่"],
1001
+ "ti": ["መደብ"],
1002
+ "tk": ["Kategoriýa"],
1003
+ "tl": ["Kategorya", "Kaurian"],
1004
+ "tly": ["Tispir"],
1005
+ "tn": ["Karolo"],
1006
+ "tpi": ["Grup"],
1007
+ "tr": ["KAT", "Kategori"],
1008
+ "trv": ["Snakun", "分类", "分類"],
1009
+ "tt": ["Törkem", "Категория", "Төркем"],
1010
+ "tw": ["Nkyekyεmu"],
1011
+ "ty": ["Catégorie"],
1012
+ "tyv": ["Аңгылал", "Категория"],
1013
+ "udm": ["Категория"],
1014
+ "ug": ["تۈر"],
1015
+ "uk": ["Категория", "Категорія"],
1016
+ "ur": ["زمرہ"],
1017
+ "uz": ["Kategoriya", "Turkum"],
1018
+ "vec": ["Categoria"],
1019
+ "vep": ["Kategorii"],
1020
+ "vi": ["Thể loại"],
1021
+ "vls": ["Categorie"],
1022
+ "vo": ["Klad"],
1023
+ "wa": ["Categoreye"],
1024
+ "war": ["Kaarangay"],
1025
+ "wo": ["Catégorie", "Wàll"],
1026
+ "wuu": ["分类", "分類"],
1027
+ "xal": ["Янз", "Әәшл"],
1028
+ "xmf": ["კატეგორია"],
1029
+ "yi": ["קאַטעגאָריע", "קאטעגאריע"],
1030
+ "yo": ["Ẹ̀ka"],
1031
+ "za": ["分类", "分類"],
1032
+ "zea": ["Categorie"],
1033
+ "zgh": ["ⴰⵙⵎⵉⵍ"],
1034
+ "zh": ["CAT", "分类", "分類"],
1035
+ "zh-classical": ["CAT", "分类", "分類"],
1036
+ "zh-min-nan": ["Lūi-pia̍t", "分类", "分類"],
1037
+ "zh-yue": ["分类", "分類", "类", "類"],
1038
+ }
1039
+
1040
# NOTE: The official Wikimedia servers rate-limit downloaders and cap the number of per-IP connections at 2.
# Their mirror sites do not have this cap: https://dumps.wikimedia.org/mirrors.html
_HOST = "https://dumps.wikimedia.org"
# Path of a dump directory relative to the host; formatted with the wiki language and the dump date.
_URL_PATH_SEGMENT = "/{lang}wiki/{date}/"
# Name of the JSON status file that is downloaded from the dump directory before any data file.
_INFO_FILE = "dumpstatus.json"


# Default version passed to the builder config.
_VERSION = datasets.Version("4.0.0", "")
1048
+
1049
+
1050
class WikipediaConfig(datasets.BuilderConfig):
    """Builder configuration identifying one Wikipedia dump (a language at a given date)."""

    def __init__(self, language=None, date=None, host=_HOST, version=_VERSION, **kwargs):
        """Create the configuration for a single Wikipedia dump.

        Args:
            language (str): Language code for the Wikipedia dump to use.
            date (str): Date of the Wikipedia dump in YYYYMMDD format. A list of
                available dates can be found at https://dumps.wikimedia.org/enwiki/.
            host (str, defaults to 'https://dumps.wikimedia.org'): URL of the server that hosts the
                Wikipedia dump. The official Wikimedia host rate-limits downloaders and caps the number
                of per-IP connections at 2; to parallelize downloads, pass a mirror instead:
                https://dumps.wikimedia.org/mirrors.html
            version: Dataset version forwarded to the parent config.
            **kwargs: Keyword arguments forwarded to super.
        """
        config_name = f"{date}.{language}"
        config_description = f"Wikipedia dataset for {language}, parsed from {date} dump."
        super().__init__(name=config_name, description=config_description, version=version, **kwargs)
        # Stored without trailing slashes because the URL path segment appended later starts with "/".
        self.host = host.rstrip("/")
        self.language = language
        self.date = date
1074
+
1075
+
1076
class Wikipedia(datasets.GeneratorBasedBuilder):
    """Wikipedia dataset builder: downloads a multistream dump and yields cleaned articles."""

    # The download host is taken from the config (``WikipediaConfig.host``); pointing it at a
    # mirror avoids the download caps of the official Wikimedia host.
    BUILDER_CONFIG_CLASS = WikipediaConfig

    def _info(self):
        """Return the dataset metadata; every example has string ``id``, ``url``, ``title`` and ``text``."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "url": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "text": datasets.Value("string"),
                }
            ),
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _get_base_url(self):
        """Return the URL of the dump directory for the configured host, language and date."""
        # Hyphens in the language code are replaced by underscores in the dump directory name.
        return self.config.host + _URL_PATH_SEGMENT.format(
            lang=self.config.language.replace("-", "_"), date=self.config.date
        )

    def _split_generators(self, dl_manager):
        """Download the dump files and describe the single TRAIN split.

        Args:
            dl_manager: Download manager used to fetch the status file and the data files.

        Returns:
            list: One ``datasets.SplitGenerator`` whose ``gen_kwargs`` hold three parallel lists
            (``filepaths``, ``starts``, ``ends``), one entry per compressed stream.

        Raises:
            FileNotFoundError: If the multistream job of the requested dump is not marked 'done'.
        """
        base_url = self._get_base_url()

        # Download dump status info file
        info_path = dl_manager.download_and_extract(base_url + _INFO_FILE)
        with open(info_path, encoding="utf-8") as f:
            dump_info = json.load(f)
        multistream_dump_info = dump_info["jobs"]["articlesmultistreamdump"]
        if multistream_dump_info["status"] != "done":
            raise FileNotFoundError(
                f"Specified dump ({base_url}) multistream status is not 'done':"
                f" {multistream_dump_info['status']}"
            )

        # Download index and multistream XML data files
        index_urls = []
        xml_urls = []
        for fname in multistream_dump_info["files"]:
            if ".txt" in fname:
                index_urls.append(base_url + fname)
            elif ".xml" in fname:
                xml_urls.append(base_url + fname)
        # Index and XML files are paired by their position after sorting each list.
        data_urls = list(zip(sorted(index_urls), sorted(xml_urls)))  # Parallelize data downloading
        data_paths = dl_manager.download(data_urls)

        # Parallelize over concatenated multiple compressed streams (with 100 pages each)
        filepaths, starts, ends = [], [], []
        for index_path, xml_path in data_paths:
            index = _extract_index(index_path)
            for start, end in _pairwise(index):
                filepaths.append(xml_path)
                starts.append(start)
                ends.append(end)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepaths": filepaths,
                    "starts": starts,
                    "ends": ends,
                },
            )
        ]

    def _generate_examples(self, filepaths, starts, ends):
        """Yield ``(id, example)`` pairs for every article of every stream, in the given order.

        Args:
            filepaths: Path of the multistream XML file for each stream.
            starts: Byte offset at which each stream starts in its file.
            ends: Byte offset at which each stream ends (``None`` for the last stream of a file).
        """
        # No shuffle anymore
        for filepath, start, end in zip(filepaths, starts, ends):
            for id_, title, raw_content in _extract_content(filepath, start, end):
                yield from _clean_content((id_, title, raw_content), self.config.language)
1153
+
1154
+
1155
def _extract_index(filepath):
    """Return the sorted, de-duplicated stream offsets listed in a multistream index file.

    Each non-blank line of the bz2-compressed index is expected to look like
    ``offset:page_id:title``; only the leading ``offset`` field is read.

    Args:
        filepath: Path of the bz2-compressed index file.

    Returns:
        list[int]: Distinct byte offsets in ascending order.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    offsets = set()
    # bz2.open in text mode closes both the decompressor and the underlying file on exit.
    with bz2.open(filepath, mode="rt", encoding="utf-8") as index_file:
        for line in index_file:
            if not line.strip():
                continue
            # Split on the first ":" only: titles may contain colons and quote characters,
            # which a CSV parser would interpret as delimiters / quoted fields spanning rows.
            offsets.add(int(line.partition(":")[0]))
    return sorted(offsets)
1160
+
1161
+
1162
def _pairwise(index):
    """Pair each item of *index* with its successor; the last item is paired with ``None``.

    For ``[a, b, c]`` the returned iterator produces ``(a, b)``, ``(b, c)``, ``(c, None)``.
    """
    current, following = itertools.tee(index, 2)
    # Shift the second copy one step ahead so that it always points at the next item.
    next(following, None)
    return itertools.zip_longest(current, following, fillvalue=None)
1166
+
1167
+
1168
def _extract_content(filepath, start, end):
    """Yield ``(id, title, raw_text)`` for the articles of one stream of a multistream XML file.

    Args:
        filepath: Path of the bz2 multistream XML file.
        start: Byte offset at which the stream begins.
        end: Byte offset at which the stream ends; a falsy value means "read to the end of the file".

    Pages outside namespace "0", redirects and pages without text are skipped.
    """
    logger.info("generating examples from = %s", filepath)
    with open(filepath, "rb") as stream_file:
        stream_file.seek(start)
        length = end - start if end else -1
        payload = stream_file.read(length)
    # Enclose within a single root node to avoid ParseError: junk after document element
    xml_bytes = b"<mediawiki>" + bz2.BZ2Decompressor().decompress(payload) + b"</mediawiki>"
    with io.StringIO(xml_bytes.decode(encoding="utf-8")) as xml_text:
        for _event, node in etree.iterparse(xml_text):
            if not node.tag.endswith("page"):
                continue
            # Whatever precedes "page" in the tag is reused as the prefix of the child tags.
            prefix = node.tag[:-4]
            page_ns = node.find(f"./{prefix}ns").text
            # Keep only pages of the "main" namespace that are not redirects.
            is_article = page_ns == "0" and node.find(f"./{prefix}redirect") is None
            if is_article:
                page_id = node.find(f"./{prefix}id").text
                page_title = node.find(f"./{prefix}title").text
                page_text = node.find(f"./{prefix}revision/{prefix}text").text
            # The element is emptied once its fields have been read.
            node.clear()
            # Filter empty pages
            if is_article and page_text is not None:
                yield page_id, page_title, page_text
1196
+
1197
+
1198
def _clean_content(inputs, language):
    """Turn one raw page into at most one ``(id, example)`` pair.

    Args:
        inputs: ``(id, title, raw_content)`` tuple for a single page.
        language: Language code used for cleaning and for building the page URL.

    Nothing is yielded when the wikicode cannot be parsed (the error is logged) or when the
    cleaned text is empty.
    """
    page_id, page_title, wikitext = inputs
    try:
        cleaned = _parse_and_clean_wikicode(wikitext, parser=mwparserfromhell, language=language)
    except mwparserfromhell.parser.ParserError as err:
        logger.error("mwparserfromhell ParseError: %s", err)
    else:
        if cleaned:
            example = {
                "id": page_id,
                "url": _construct_url(page_title, language),
                "title": page_title,
                "text": cleaned,
            }
            yield page_id, example
1210
+
1211
+
1212
def _parse_and_clean_wikicode(raw_content, parser, language):
    """Strip formatting and unwanted sections from raw page content.

    Args:
        raw_content (str): Raw wikitext of a single page.
        parser: Object exposing ``parse(raw_content)``; callers in this file pass the
            ``mwparserfromhell`` module.
        language (str): Wikipedia language code used to look up the localized media and
            category namespace aliases. Both the hyphenated form used as alias-table key
            (e.g. ``"zh-min-nan"``) and the underscore form (``"zh_min_nan"``) are accepted.

    Returns:
        str: The cleaned text of all sections, joined by blank lines.
    """
    wikicode = parser.parse(raw_content)

    def localized_aliases(alias_table):
        # The alias tables are keyed with hyphenated codes; fall back to the hyphenated
        # spelling so that underscore codes do not silently lose their aliases.
        if not language:
            return []
        return alias_table.get(language) or alias_table.get(language.replace("_", "-"), [])

    def prefix_pattern(prefixes):
        # Aliases are literal text and some contain regex metacharacters (e.g. parentheses),
        # so each one is escaped before being joined into the alternation.
        alternatives = "|".join(re.escape(prefix) for prefix in prefixes)
        return re.compile(f"^(?:{alternatives}):", flags=re.IGNORECASE | re.UNICODE)

    # Filters for magic words that are parser instructions -- e.g., __NOTOC__
    re_rm_magic = re.compile("__[A-Z]*__", flags=re.UNICODE)

    # Filters for file/image links.
    re_rm_wikilink = prefix_pattern(["File", "Image", "Media"] + localized_aliases(MEDIA_ALIASES))

    def rm_wikilink(obj):
        return bool(re_rm_wikilink.match(str(obj.title)))

    # Filters for references and tables
    def rm_tag(obj):
        return str(obj.tag) in {"ref", "table"}

    # Leave category links in-place but remove the category prefixes
    re_clean_wikilink = prefix_pattern(["Category"] + localized_aliases(CAT_ALIASES))

    def is_category(obj):
        return bool(re_clean_wikilink.match(str(obj.title)))

    def clean_wikilink(obj):
        # Replace the link text by its stripped form without the category prefix.
        text = obj.__strip__()
        text = re.sub(re_clean_wikilink, "", text)
        obj.text = text

    def try_replace_obj(obj):
        try:
            clean_wikilink(obj)
        except ValueError:
            # For unknown reasons, objects are sometimes not found.
            pass

    def try_remove_obj(obj, section):
        try:
            section.remove(obj)
        except ValueError:
            # For unknown reasons, objects are sometimes not found.
            pass

    section_text = []
    # Filter individual sections to clean.
    for section in wikicode.get_sections(flat=True, include_lead=True, include_headings=True):
        for obj in section.ifilter_wikilinks(recursive=True):
            if rm_wikilink(obj):
                try_remove_obj(obj, section)
            elif is_category(obj):
                try_replace_obj(obj)
        for obj in section.ifilter_tags(matches=rm_tag, recursive=True):
            try_remove_obj(obj, section)

        section_text.append(re.sub(re_rm_magic, "", section.strip_code().strip()))
    return "\n\n".join(section_text)
1269
+
1270
+
1271
def _construct_url(title, language):
    """Return the article URL for *title* on the Wikipedia of the given *language*.

    Args:
        title (str): Page title; it is percent-encoded with ``quote`` ("/" is kept as is).
        language (str): Wikipedia language code, hyphenated ("zh-min-nan") or with
            underscores ("zh_min_nan").

    Returns:
        str: URL of the form ``https://<language>.wikipedia.org/wiki/<quoted title>``.
    """
    # See: https://meta.wikimedia.org/wiki/Help:URL
    # Underscores are not valid in hostnames, so underscore language codes are mapped to the
    # hyphenated subdomain spelling.
    subdomain = language.replace("_", "-")
    return f"https://{subdomain}.wikipedia.org/wiki/{quote(title)}"
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ datatrove[io]==0.6.0
2
+ gradio==5.49.1
3
+ mwparserfromhell==0.7.2
4
+ datasets==4.0.0