Spaces:
Sleeping
Sleeping
| from datetime import datetime | |
| from process_data import prepare_dataset | |
| from datasets import load_dataset | |
| from collect_data_wikidata_ent import collect_wikidata_entity | |
| from collect_data_wikidata_prop import collect_wikidata | |
| from collect_data_wikinews import collect_wikinews | |
| from collect_data_wikiquote import collect_wikiquote | |
| from collect_data_es import collect_es | |
| from collect_data_gr import collect_gr | |
def normalize_outcome(o):
    """Map an outcome string onto one of four fixed Greek labels.

    The input is lower-cased and searched for known substrings. Rules are
    tried in order and the first match wins:

    * contains 'διαγρ'                  -> 'Διαγραφή' (deletion)
    * contains 'διατήρη' or 'παραμονή'  -> 'Διατήρηση' (keep)
    * contains 'συγχών'                 -> 'συγχώνευση' (merge)
    * anything else                     -> 'Δεν υπάρχει συναίνεση' (no consensus)

    Args:
        o: Outcome text; must support ``.lower()`` (i.e. be a ``str``).

    Returns:
        The canonical label as a ``str``.
    """
    text = o.lower()
    # Order matters: a string mentioning both deletion and merge is
    # classified as deletion, because that rule is checked first.
    rules = (
        (('διαγρ',), 'Διαγραφή'),
        (('διατήρη', 'παραμονή'), 'Διατήρηση'),
        (('συγχών',), 'συγχώνευση'),
    )
    for needles, label in rules:
        if any(needle in text for needle in needles):
            return label
    return 'Δεν υπάρχει συναίνεση'
| def collect(mode, start_date=None, end_date=None, url=None, title=None, output_path=None, | |
| platform=None, lang=None, date=None, years=None): | |
| if mode not in ['date_range', 'date', 'title','url','wide_2023']: | |
| raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title','url','wide_2023']") | |
| if mode == 'wide_2023': | |
| dataset = load_dataset('hsuvaskakoty/wide_analysis') | |
| print('Dataset loaded successfully as huggingface dataset') | |
| print('The dataset has the following columns:', dataset.column_names) | |
| return dataset | |
| underlying_mode = mode | |
| if mode in ['date', 'date_range']: | |
| underlying_mode = 'year' | |
| if mode == 'url': | |
| underlying_mode = 'url' | |
| if (platform is None and lang is None) or (platform=='wikipedia' and lang=='en'): | |
| if mode in ['date_range', 'date', 'title']: | |
| return prepare_dataset( | |
| mode=mode, | |
| start_date=start_date, | |
| end_date=end_date, | |
| url=url, | |
| title=title, | |
| output_path=output_path | |
| ) | |
| else: | |
| print("Invalid input. Choose from ['date_range', 'date', 'title','wide_2023']") | |
| return None | |
| if platform == 'wikidata_entity': | |
| if underlying_mode == 'title': | |
| if not title or (years and len(years)>0): | |
| raise ValueError("For 'title' mode in wikidata entity, 'title' must be provided and 'years' must be empty.") | |
| return collect_wikidata_entity(mode='title', title=title, years=[]) | |
| elif underlying_mode == 'year': | |
| if start_date and end_date: | |
| start_year = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| end_year = int(datetime.strptime(end_date, "%Y-%m-%d").year) | |
| return collect_wikidata_entity(mode='year', years=[start_year, end_year]) | |
| elif start_date: | |
| single_year = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| return collect_wikidata_entity(mode='year', years=single_year) | |
| else: | |
| raise ValueError("For 'year' mode in wikidata entity, start_date (and optionally end_date) is required.") | |
| elif underlying_mode == 'url': | |
| if not url: | |
| raise ValueError("For 'url' mode in wikidata entity, 'url' must be provided.") | |
| return collect_wikidata_entity(mode='url', url=url) | |
| else: | |
| raise ValueError("Invalid mode for wikidata entity. Use 'title' or 'year'.") | |
| elif platform == 'wikidata_property': | |
| if underlying_mode == 'title': | |
| if not title or (years and len(years)>0): | |
| raise ValueError("For 'title' mode in wikidata property, 'title' must be provided and 'years' must be empty.") | |
| return collect_wikidata(mode='title', title=title, years=[]) | |
| elif underlying_mode == 'url': | |
| if not url: | |
| raise ValueError("For 'url' mode in wikidata property, 'url' must be provided.") | |
| return collect_wikidata(mode='url', title='', url=url, years=[]) | |
| elif underlying_mode == 'year': | |
| if start_date and end_date: | |
| start_year = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| end_year = int(datetime.strptime(end_date, "%Y-%m-%d").year) | |
| return collect_wikidata(mode='year', years=[start_year, end_year]) | |
| elif start_date: | |
| single_year = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| return collect_wikidata(mode='year', years=single_year) | |
| else: | |
| raise ValueError("For 'year' mode in wikidata property, start_date (and optionally end_date) is required.") | |
| else: | |
| raise ValueError("Invalid mode for wikidata property. Use 'title' or 'year'.") | |
| # else: | |
| # raise ValueError("Invalid lang for wikidata. Use 'entity' or 'property'.") | |
| elif platform == 'wikinews': | |
| if underlying_mode == 'title': | |
| if not title: | |
| raise ValueError("For 'title' mode in wikinews, 'title' is required.") | |
| return collect_wikinews(mode='title', title=title) | |
| elif underlying_mode == 'url': | |
| if not url: | |
| raise ValueError("For 'url' mode in wikinews, 'url' is required.") | |
| return collect_wikinews(mode='url', url=url) | |
| elif underlying_mode == 'year': | |
| if start_date and end_date: | |
| start_y = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| end_y = int(datetime.strptime(end_date, "%Y-%m-%d").year) | |
| return collect_wikinews(mode='year', year=[start_y, end_y]) | |
| elif start_date: | |
| single_y = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| return collect_wikinews(mode='year', year=single_y) | |
| else: | |
| raise ValueError("For 'year' mode in wikinews, start_date (and optionally end_date) is required.") | |
| else: | |
| raise ValueError("Invalid mode for wikinews. Use 'title' or 'year' or 'url'.") | |
| # elif platform == 'wikiquote': | |
| # if underlying_mode != 'title': | |
| # raise ValueError("Wikiquote collection currently only supports 'title' mode.") | |
| # if not title: | |
| # title = 'all' | |
| # return collect_wikiquote(mode='title', title=title) | |
| elif platform == 'wikiquote': | |
| if underlying_mode not in ['title', 'url']: | |
| raise ValueError("Wikiquote collection currently only supports 'title' or 'url' mode.") | |
| if underlying_mode == 'title': | |
| if not title: | |
| title = 'all' | |
| return collect_wikiquote(mode='title', title=title) | |
| elif underlying_mode == 'url': | |
| if not url: | |
| raise ValueError("For 'url' mode in wikiquote, 'url' must be provided.") | |
| return collect_wikiquote(mode='url', url=url) | |
| elif platform == 'wikipedia': | |
| if lang == 'es': | |
| if underlying_mode == 'title': | |
| if not title or date: | |
| raise ValueError("For 'title' mode in spanish wikipedia, 'title' must be provided and 'date' must be empty.") | |
| return collect_es(mode='title', title=title, date='') | |
| elif underlying_mode == 'year': | |
| if not date: | |
| raise ValueError("For 'year' mode in spanish wikipedia, 'date' parameter (dd/mm/yyyy) is required.") | |
| return collect_es(mode='year', title='', date=date) | |
| else: | |
| raise ValueError("Invalid mode for spanish wikipedia. Use 'title' or 'year'.") | |
| elif lang == 'gr': | |
| if underlying_mode == 'title': | |
| if not title or not years or len(years) != 1: | |
| raise ValueError("For 'title' mode in greek wikipedia, 'title' and a single-element list years=['mm/yyyy'] are required.") | |
| return collect_gr(mode='title', title=title, years=years) | |
| elif underlying_mode == 'year': | |
| if start_date and end_date: | |
| start_y = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| end_y = int(datetime.strptime(end_date, "%Y-%m-%d").year) | |
| return collect_gr(mode='year', title='', years=[start_y,end_y]) | |
| elif start_date: | |
| single_y = int(datetime.strptime(start_date, "%Y-%m-%d").year) | |
| return collect_gr(mode='year', title='', years=[single_y]) | |
| else: | |
| raise ValueError("For 'year' mode in greek wikipedia, start_date (and optionally end_date) is required.") | |
| else: | |
| raise ValueError("Invalid mode for greek wikipedia. Use 'title' or 'year'.") | |
| else: | |
| raise ValueError("Invalid lang for wikipedia. Use 'en', 'es', or 'gr'.") | |
| else: | |
| raise ValueError("Invalid platform. Use 'wikipedia', 'wikidata_entity', Wikidata_property', 'wikinews', or 'wikiquote'.") |