import requests
import pandas as pd
from bs4 import BeautifulSoup
import pysbd
from datetime import datetime, timedelta

def extract_div_contents_with_additional_columns(url, log_date):
    response = requests.get(url)
    if response.status_code != 200:
        return pd.DataFrame(columns=['log_date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ['boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    data = []
    for div in divs:
        title = None
        text_url = None
        title_tag = div.find('a')
        if title_tag:
            title_span = div.find('span', {'data-mw-comment-start': True})
            if title_span:
                title_anchor = title_span.find_next_sibling('a')
                if title_anchor:
                    title = title_anchor.text
                    text_url = 'https://en.wikipedia.org' + title_anchor['href']
            else:
                title = title_tag.text
                text_url = 'https://en.wikipedia.org' + title_tag['href']
        if title is None:
            continue
        deletion_discussion = div.prettify()
        # Extract label (closing-statement paragraph, kept as HTML)
        label = ''
        verdict_tag = div.find('p')
        if verdict_tag:
            label_b_tag = verdict_tag.find('b')
            if label_b_tag:
                label = verdict_tag.prettify()
        # Extract confirmation (guard against discussions without a <dd> element)
        confirmation = ''
        dd_tag = div.find('dd')
        discussion_tag = dd_tag.find('i') if dd_tag else None
        if discussion_tag:
            confirmation_b_tag = discussion_tag.find('b')
            if confirmation_b_tag:
                confirmation = discussion_tag.prettify()
        parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
        discussion = parts[0] if len(parts) > 0 else ''
        verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
        data.append([log_date, title, text_url, deletion_discussion, label, confirmation, discussion, verdict])
    df = pd.DataFrame(data, columns=['log_date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
    return df

def extract_div_contents_from_url(url, date):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ['boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    log_date = url.split('/')[-1]
    data = []
    for div in divs:
        try:
            title = None
            text_url = None
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            deletion_discussion = div.prettify()
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
            data.append([date, title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    df = pd.DataFrame(data, columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    return df

def extract_div_contents_from_url_new(url, date):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ['boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk', 'mw-heading mw-heading3']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    log_date = url.split('/')[-1]
    data = []
    for i, div in enumerate(divs):
        try:
            title = None
            text_url = None
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            # Collect the discussion HTML from the siblings between this heading and the next one.
            next_div = div.find_next('div', class_='mw-heading mw-heading3')
            deletion_discussion = ''
            sibling = div.find_next_sibling()
            while sibling and sibling != next_div:
                deletion_discussion += str(sibling)
                sibling = sibling.find_next_sibling()
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
            data.append([date, title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    df = pd.DataFrame(data, columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    return df

def extract_label(label_html):
    soup = BeautifulSoup(label_html, 'html.parser')
    b_tag = soup.find('b')
    return b_tag.text.strip() if b_tag else ''

def process_labels(df):
    df['proper_label'] = df['label'].apply(extract_label)
    return df

def extract_confirmation(confirmation_html):
    soup = BeautifulSoup(confirmation_html, 'html.parser')
    # Guard against confirmations that have no red span at all.
    red_span = soup.find('span', {'style': 'color:red'})
    b_tag = red_span.find('b') if red_span else None
    return b_tag.text.strip() if b_tag else ''

def process_confirmations(df):
    df['confirmation'] = df['confirmation'].apply(extract_confirmation)
    return df

def extract_post_links_text(discussion_html):
    split_point = '<span class="plainlinks">'
    if split_point in discussion_html:
        parts = discussion_html.split(split_point)
        if len(parts) > 1:
            return parts[1]
    return discussion_html

def process_discussion(df):
    df['discussion_cleaned'] = df['discussion'].apply(extract_post_links_text)
    return df

def html_to_plaintext(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
    return text

def process_html_to_plaintext(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(html_to_plaintext)
    return df

def split_text_into_sentences(text):
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences[1:])

def process_split_text_into_sentences(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    return df
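
# Illustrative note (not part of the original module): split_text_into_sentences drops the
# first pysbd segment, which in this pipeline is typically the closer's result line.
# For example, an input like "The result was keep. Editor A argued X." would come back
# roughly as "Editor A argued X."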

def process_data(url, date):
    df = extract_div_contents_from_url(url, date)
    # Fall back to the sibling-walking extractor when the first pass finds nothing.
    if df.empty:
        df = extract_div_contents_from_url_new(url, date)
    df = process_discussion(df)
    df = process_html_to_plaintext(df)
    df = process_split_text_into_sentences(df)
    if not df.empty:
        return df
    else:
        return 'Empty DataFrame'
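
# Hypothetical usage of the single-discussion pipeline (the URL fragment and date below
# are examples only, not values from the original code):
# result = process_data(
#     'https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/Log/2024_January_1#Example_article',
#     '2024-01-01',
# )
# result is a DataFrame with a 'discussion_cleaned' column, or the string
# 'Empty DataFrame' if no matching discussion was found.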

def collect_deletion_discussions(start_date, end_date):
    base_url = 'https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/Log/'
    all_data = pd.DataFrame()
    current_date = start_date
    while current_date <= end_date:
        try:
            print(f"Processing {current_date.strftime('%Y-%B-%d')}")
            date_str = current_date.strftime('%Y_%B_%d')
            url = base_url + date_str
            log_date = current_date.strftime('%Y-%m-%d')
            df = extract_div_contents_with_additional_columns(url, log_date)
            if not df.empty:
                df = process_labels(df)
                df = process_confirmations(df)
                df = process_discussion(df)
                df = process_html_to_plaintext(df)
                df = process_split_text_into_sentences(df)
                all_data = pd.concat([all_data, df], ignore_index=True)
            current_date += timedelta(days=1)
        except Exception as e:
            print(f"Error processing {current_date.strftime('%Y-%B-%d')}: {e}")
            current_date += timedelta(days=1)
            continue
    return all_data
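
# Minimal usage sketch; the date range and output filename below are illustrative
# assumptions, not part of the original pipeline.
if __name__ == '__main__':
    start = datetime(2024, 1, 1)
    end = datetime(2024, 1, 7)
    collected = collect_deletion_discussions(start, end)
    if not collected.empty:
        collected.to_csv('afd_discussions_sample.csv', index=False)
    print(f"Collected {len(collected)} closed AfD discussions")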