Spaces:
Sleeping
Sleeping
Upload data_prep.py
Browse files- data_prep.py +7 -11
data_prep.py
CHANGED
|
@@ -82,11 +82,7 @@ def extract_div_contents_from_url(url):
|
|
| 82 |
return df
|
| 83 |
|
| 84 |
|
| 85 |
-
|
| 86 |
-
import pandas as pd
|
| 87 |
-
from bs4 import BeautifulSoup
|
| 88 |
-
|
| 89 |
-
def extract_div_contents_from_url_new(url, date):
|
| 90 |
response = requests.get(url)
|
| 91 |
if response.status_code != 200:
|
| 92 |
print(f"Error: Received status code {response.status_code} for URL: {url}")
|
|
@@ -154,15 +150,15 @@ def extract_div_contents_from_url_new(url, date):
|
|
| 154 |
if confirmation_b_tag:
|
| 155 |
confirmation = confirmation_b_tag.text.strip()
|
| 156 |
parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
|
| 157 |
-
discussion = parts[
|
| 158 |
verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
|
| 159 |
|
| 160 |
-
data.append([
|
| 161 |
except Exception as e:
|
| 162 |
print(f"Error processing div: {e}")
|
| 163 |
continue
|
| 164 |
|
| 165 |
-
df = pd.DataFrame(data, columns=[
|
| 166 |
return df
|
| 167 |
|
| 168 |
def extract_post_links_text(discussion_html):
|
|
@@ -206,13 +202,13 @@ def process_split_text_into_sentences(df):
|
|
| 206 |
|
| 207 |
def process_data(url):
|
| 208 |
df = extract_div_contents_from_url(url)
|
| 209 |
-
|
| 210 |
-
if df.empty:
|
| 211 |
df = extract_div_contents_from_url_new(url)
|
|
|
|
| 212 |
df = process_discussion(df)
|
|
|
|
| 213 |
df = process_html_to_plaintext(df)
|
| 214 |
df = process_split_text_into_sentences(df)
|
| 215 |
-
#if not empty
|
| 216 |
if not df.empty:
|
| 217 |
return df.at[0,'title']+ ' : '+df.at[0, 'discussion_cleaned']
|
| 218 |
else:
|
|
|
|
| 82 |
return df
|
| 83 |
|
| 84 |
|
| 85 |
+
def extract_div_contents_from_url_new(url):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
response = requests.get(url)
|
| 87 |
if response.status_code != 200:
|
| 88 |
print(f"Error: Received status code {response.status_code} for URL: {url}")
|
|
|
|
| 150 |
if confirmation_b_tag:
|
| 151 |
confirmation = confirmation_b_tag.text.strip()
|
| 152 |
parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
|
| 153 |
+
discussion = parts[0] if len(parts) > 0 else ''
|
| 154 |
verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
|
| 155 |
|
| 156 |
+
data.append([ title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
|
| 157 |
except Exception as e:
|
| 158 |
print(f"Error processing div: {e}")
|
| 159 |
continue
|
| 160 |
|
| 161 |
+
df = pd.DataFrame(data, columns=[ 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
|
| 162 |
return df
|
| 163 |
|
| 164 |
def extract_post_links_text(discussion_html):
|
|
|
|
| 202 |
|
| 203 |
def process_data(url):
|
| 204 |
df = extract_div_contents_from_url(url)
|
| 205 |
+
if df.at[0,'discussion'] == '':
|
|
|
|
| 206 |
df = extract_div_contents_from_url_new(url)
|
| 207 |
+
#print(df.head())
|
| 208 |
df = process_discussion(df)
|
| 209 |
+
print(df.at[0,'discussion'])
|
| 210 |
df = process_html_to_plaintext(df)
|
| 211 |
df = process_split_text_into_sentences(df)
|
|
|
|
| 212 |
if not df.empty:
|
| 213 |
return df.at[0,'title']+ ' : '+df.at[0, 'discussion_cleaned']
|
| 214 |
else:
|