import requests
import pandas as pd
from bs4 import BeautifulSoup

def extract_div_contents_from_url(url):
    """Fetch an AfD log page and return the closed discussion whose title matches the URL fragment."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])

    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ["mw-heading mw-heading3", 'boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')

    data = []
    for div in divs:
        try:
            title = None
            text_url = None

            # Extract title and text_url
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']

            # Fall back to the heading link when the first anchor was not the article title
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']

            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue

            deletion_discussion = div.prettify()

            # Extract label
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()

            # Extract confirmation
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()

            # Split deletion_discussion into discussion and verdict
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''

            data.append([title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue

    df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
    df = df[['title', 'discussion', 'verdict', 'label']]
    print(f"DataFrame created with {len(df)} rows")
    return df

def extract_div_contents_from_url_new(url):
    """Variant extractor for log pages where each discussion sits under a plain mw-heading3 div."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])

    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ["mw-heading mw-heading3"]
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    log_date = url.split('/')[-1]  # date portion of the log URL (currently unused)

    data = []
    for i, div in enumerate(divs):
        try:
            title = None
            text_url = None

            # Extract title and text_url
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']

            # Fall back to the heading link when the first anchor was not the article title
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']

            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue

            # Collect everything between this heading and the next mw-heading3 div
            next_div = div.find_next('div', class_='mw-heading mw-heading3')
            deletion_discussion = ''
            sibling = div.find_next_sibling()
            while sibling and sibling != next_div:
                deletion_discussion += str(sibling)
                sibling = sibling.find_next_sibling()

            # Extract label
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()

            # Extract confirmation
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()

            # Split deletion_discussion into discussion and verdict
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''

            data.append([title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue

    df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    return df

def extract_post_links_text(discussion_html):
    # Keep only the HTML after the first '<span class="plainlinks">' marker, if present
    split_point = '<span class="plainlinks">'
    if split_point in discussion_html:
        parts = discussion_html.split(split_point)
        if len(parts) > 1:
            return parts[1]
    return discussion_html

def process_discussion(df):
    df['discussion_cleaned'] = df['verdict'].apply(extract_post_links_text)
    return df

def html_to_plaintext(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Preserve block boundaries as newlines before flattening to text
    for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
    return text

def process_html_to_plaintext(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(html_to_plaintext)
    df = df[['title', 'discussion_cleaned', 'label']]
    return df

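# A minimal, hypothetical illustration (not taken from this file) of what
# html_to_plaintext produces for a small fragment of discussion HTML:
#
#   html_to_plaintext('<dl><dd><b>Keep</b> per <a href="#">WP:GNG</a>.</dd></dl>')
#   # -> roughly 'Keep per WP:GNG .'
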
import pysbd

def split_text_into_sentences(text):
    # Segment the text into sentences and drop the first one
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences[1:])

def process_split_text_into_sentences(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    return df

def process_data(url):
    df = extract_div_contents_from_url(url)
    # Fall back to the newer page layout if nothing (or an empty discussion) was extracted
    if df.empty or df.at[0, 'discussion'] == '':
        df = extract_div_contents_from_url_new(url)
    if df.empty:
        return 'Empty DataFrame'
    #print(df.head())
    df = process_discussion(df)
    print(df.at[0, 'discussion'])
    df = process_html_to_plaintext(df)
    df = process_split_text_into_sentences(df)
    if not df.empty:
        return df.at[0, 'title'] + ' : ' + df.at[0, 'discussion_cleaned']
    else:
        return 'Empty DataFrame'
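
# Example usage, as a minimal sketch: the log URL and '#Example_Article' fragment
# below are assumed for illustration and do not come from this file; a real call
# should point at an actual Wikipedia AfD log page plus section anchor, since the
# extractors above match the heading title against url.split('#')[-1].
if __name__ == '__main__':
    sample_url = ('https://en.wikipedia.org/wiki/'
                  'Wikipedia:Articles_for_deletion/Log/2024_May_1#Example_Article')
    print(process_data(sample_url))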