Spaces:

philipp-zettl
/

qa-generator

Sleeping

App Files Files Community

merge master into main

by philipp-zettl - opened Jun 11, 2024

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

-838

Files changed (8) hide show

README.md +5 -6
app.py +0 -430
optimization.py +0 -66
requirements.txt +0 -10
src/__init__.py +0 -0
src/optimization.py +0 -66
src/text.py +0 -130
text.py +0 -130

README.md CHANGED Viewed

@@ -1,14 +1,13 @@
 ---
-title: QA/FAQ Generator
-emoji: 📈
-colorFrom: gray
 colorTo: green
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
-pinned: true
 license: apache-2.0
-short_description: Generates Questions and Answers from given text content.
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Qa Generator
+emoji: 👁
+colorFrom: green
 colorTo: green
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
+pinned: false
 license: apache-2.0
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py DELETED Viewed

@@ -1,430 +0,0 @@
-import gradio as gr
-import torch
-import itertools
-import pandas as pd
-import spaces
-import random
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
-from sklearn.metrics import pairwise_distances
-from collections import Counter
-from itertools import chain
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-import math
-import markdown
-from src.text import doctree_from_url, get_selectors_for_class, split_by_heading, DocTree
-from src.optimization import ngrams, count_ngrams, self_bleu, dist_n, perplexity, js_divergence
-# Load the question-answering and question-generation checkpoints.
-# `model_name` is reused as a scratch variable and ends up naming the QG checkpoint.
-model_name = 'philipp-zettl/t5-small-long-qa'
-qa_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-model_name = 'philipp-zettl/t5-small-qg'
-qg_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-# One tokenizer is loaded here and passed to both seq2seq models by gen().
-tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')
-# Embedding model and matching tokenizer used by embedding_similarity().
-embedding_model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
-embedding_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
-# Move all three models to the GPU if one is available, otherwise keep them on the CPU.
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-qa_model = qa_model.to(device)
-qg_model = qg_model.to(device)
-embedding_model = embedding_model.to(device)
-# Initial values of the "Number Questions" / "Number Answers" inputs in main().
-max_questions = 1
-max_answers = 1
-# Upper bound used for those number inputs and by variable_outputs().
-max_elem_value = 100
-def embedding_similarity(inputs, outputs):
-    global embedding_model, embedding_tokenizer, device
-    def embed(texts):
-        inputs = embedding_tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
-        with torch.no_grad():
-            outputs = embedding_model(**inputs)
-        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
-    input_embeddings = embed(inputs)
-    output_embeddings = embed(outputs)
-    similarities = pairwise_distances(input_embeddings, output_embeddings, metric='cosine')
-    return sum(similarities) / len(similarities)
-def evaluate_model(num_beams, num_beam_groups, model, tokenizer, eval_data, max_length=85):
-    generated_outputs = []
-    for input_text in eval_data:
-        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
-        outputs = model.generate(
-            input_ids,
-            num_beams=num_beams,
-            num_beam_groups=num_beam_groups,
-            diversity_penalty=1.0,
-            max_new_tokens=max_length,
-        )
-        decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        generated_outputs.append(decoded_text.split())
-    # Self-BLEU for diversity
-    diversity_score = self_bleu(generated_outputs)
-    # Dist-1 and Dist-2 for diversity
-    dist1 = dist_n(generated_outputs, 1)
-    dist2 = dist_n(generated_outputs, 2)
-    # Perplexity for fluency and relevance
-    fluency_score = perplexity(model, tokenizer, [" ".join(output) for output in generated_outputs])
-    # Embedding similarity for contextual relevance
-    contextual_score = embedding_similarity(eval_data, [" ".join(output) for output in generated_outputs])
-    # Jensen-Shannon Divergence for distribution similarity
-    generated_ngrams = count_ngrams(list(chain(*generated_outputs)), 4)
-    reference_ngrams = count_ngrams(list(chain(*[tokenizer.tokenize(text) for text in eval_data])), 4)
-    all_ngrams = set(generated_ngrams.keys()).union(set(reference_ngrams.keys()))
-    p = [generated_ngrams[ngram] for ngram in all_ngrams]
-    q = [reference_ngrams[ngram] for ngram in all_ngrams]
-    jsd_score = js_divergence(p, q)
-    return {
-        "diversity_score": diversity_score,
-        "dist1": dist1,
-        "dist2": dist2,
-        "fluency_score": fluency_score,
-        "contextual_score": contextual_score,
-        "jsd_score": jsd_score
-}
-def find_best_parameters(eval_data, model, tokenizer, max_length=85):
-    # Parameter ranges
-    parameter_map = {
-        2: [2],
-        4: [2],
-        6: [2], # 6x3 == 4x2
-        8: [2], # 8x4 == 6x3 == 4x2
-        9: [3],
-        10: [2], # 10x5 == 8x4 == 6x3 == 4x2
-    }
-    # Find the best parameters
-    best_score = -float('inf')
-    best_params = None
-    for num_beams in parameter_map.keys():
-        for num_beam_groups in parameter_map[num_beams]:
-            if num_beam_groups > num_beams:
-                continue  # num_beam_groups should not be greater than num_beams
-            scores = evaluate_model(num_beams, num_beam_groups, model, tokenizer, eval_data, max_length=max_length)
-            # Combine scores to determine the best parameters
-            combined_score = (scores['dist1'] + scores['dist2'] - scores['fluency_score'] + scores['contextual_score'] - scores['jsd_score']).mean()
-            print(f"num_beams={num_beams}, num_beam_groups={num_beam_groups}, avg combined score={combined_score}")
-            if combined_score > best_score:
-                best_score = combined_score
-                best_params = (num_beams, num_beam_groups)
-    print(f"Best parameters: num_beams={best_params[0]}, num_beam_groups={best_params[1]} with combined score={best_score}")
-    return best_params
-def run_model(inputs, tokenizer, model, num_beams=2, num_beam_groups=2, temperature=0.5, num_return_sequences=1, max_length=85, seed=42069):
-    all_outputs = []
-    torch.manual_seed(seed)
-    for input_text in inputs:
-        model_inputs = tokenizer([input_text], max_length=512, padding=True, truncation=True)
-        input_ids = torch.tensor(model_inputs['input_ids']).to(device)
-        for sample in input_ids:
-            sample_outputs = []
-            with torch.no_grad():
-                sample_output = model.generate(
-                    input_ids[:1],
-                    max_length=max_length,
-                    num_return_sequences=num_return_sequences,
-                    low_memory=True,
-                    use_cache=True,
-                    # Diverse Beam search decoding
-                    num_beams=max(2, num_return_sequences),
-                    num_beam_groups=max(2, num_return_sequences),
-                    diversity_penalty=temperature,
-                )
-                for i, sample_output in enumerate(sample_output):
-                    sample_output = sample_output.unsqueeze(0)
-                    sample_output = tokenizer.decode(sample_output[0], skip_special_tokens=True)
-                    sample_outputs.append(sample_output)
-            all_outputs.append(sample_outputs)
-    return all_outputs
-@spaces.GPU
-def gen(content, temperature_qg=0.5, temperature_qa=0.75, num_return_sequences_qg=1, num_return_sequences_qa=1, max_length=85, seed=42069, optimize_questions=False):
-    inputs = [
-        f'context: {content}'
-    ]
-    question = run_model(
-        inputs,
-        tokenizer,
-        qg_model,
-        num_beams=num_return_sequences_qg,
-        num_beam_groups=num_return_sequences_qg,
-        temperature=temperature_qg,
-        num_return_sequences=num_return_sequences_qg,
-        max_length=max_length,
-        seed=seed
-    )
-    if optimize_questions:
-        q_params = find_best_parameters(
-            list(chain.from_iterable(question)), qg_model, tokenizer, max_length=max_length
-        )
-        question = run_model(
-            inputs,
-            tokenizer,
-            qg_model,
-            num_beams=q_params[0],
-            num_beam_groups=q_params[1],
-            temperature=temperature_qg,
-            num_return_sequences=num_return_sequences_qg,
-            max_length=max_length,
-            seed=seed
-        )
-    inputs = list(chain.from_iterable([
-        [f'question: {q} context: {content}' for q in q_set] for q_set in question
-    ]))
-    answer = run_model(
-        inputs,
-        tokenizer,
-        qa_model,
-        num_beams=num_return_sequences_qa,
-        num_beam_groups=num_return_sequences_qa,
-        temperature=temperature_qa,
-        num_return_sequences=num_return_sequences_qa,
-        max_length=max_length,
-        seed=seed
-    )
-    questions = list(chain.from_iterable(question))
-    answers = list(chain.from_iterable(answer))
-    results = []
-    for idx, ans in enumerate(answers):
-        results.append({'question': questions[idx % num_return_sequences_qg], 'answer': ans})
-    return results
-def variable_outputs(k, max_elems=10):
-    global max_elem_value
-    k = int(k)
-    return [gr.Text(visible=True)] * k + [gr.Text(visible=False)] * (max(max_elems, max_elem_value)- k)
-def set_outputs(content, max_elems=10):
-    """Turn the Python-literal string `content` into visible text components.
-    One visible component is created per evaluated item, padded with hidden
-    components up to max(max_elems, 10) entries.
-    """
-    # SECURITY(review): eval() runs arbitrary Python. If `content` can come
-    # from a user or a model, parse it with ast.literal_eval or json instead.
-    c = eval(content)
-    print('received content: ', c)
-    return [gr.Text(value=t, visible=True) for t in c] + [gr.Text(visible=False)] * (max(max_elems, 10) - len(c))
-def create_file_download(qnas):
-    with open('qnas.tsv', 'w') as f:
-        for idx, qna in qnas.iterrows():
-            f.write(qna['Question'] + '\t' + qna['Answer'])
-            if idx < len(qnas) - 1:
-                f.write('\n')
-    return 'qnas.tsv'
-def main():
-    """Build the 'QA Generator' tab with its 'Explanation' and 'Generate QA' sub-tabs.
-    Components are created as a side effect; nothing is returned. In this file
-    it is called inside the module-level `gr.Blocks` context.
-    """
-    with gr.Tab(label='QA Generator'):
-        with gr.Tab(label='Explanation'):
-            gr.Markdown(
-                '''
-                # QA Generator
-                This tab allows you to generate questions and answers from a given piece of text content.
-                ## How to use
-                1. Enter the text content you want to generate questions and answers from.
-                2. Adjust the diversity penalty for question generation and answer generation.
-                3. Set the maximum length of the generated questions and answers.
-                4. Choose the number of questions and answers you want to generate.
-                5. Click on the "Generate" button.
-                The next section will give you insights into the generated questions and answers.
-                If you're satisfied with the generated questions and answers, you can download them as a TSV file.
-                '''
-            )
-            with gr.Accordion(label='Optimization', open=False):
-                gr.Markdown("""
-                For optimization of the question generation we apply the following combined score:
-                $$\\text{combined} = \\text{dist1} + \\text{dist2} - \\text{fluency} + \\text{contextual} - \\text{jsd}$$
-                Here's a brief explanation of each component:
-                1. **dist1 and dist2**: These represent the diversity of the generated outputs. dist1 measures the ratio of unique unigrams to total unigrams, and dist2 measures the ratio of unique bigrams to total bigrams. <u>**Higher values indicate more diverse outputs.**</u>
-                2. **fluency**: This is the perplexity of the generated outputs, which measures how well the outputs match the language model's expectations. <u>**Lower values indicate better fluency.**</u>
-                3. **contextual**: This measures the similarity between the input and generated outputs using embedding similarity. <u>**Higher values indicate better contextual relevance.**</u>
-                4. **jsd**: This is the Jensen-Shannon Divergence between the n-gram distributions of the generated outputs and the reference data. <u>**Lower values indicate greater similarity between distributions.**</u>
-                """, latex_delimiters=[{'display': False, 'left': '$$', 'right': '$$'}])
-        with gr.Tab(label='Generate QA'):
-            with gr.Row(equal_height=True):
-                with gr.Group("Content"):
-                    content = gr.Textbox(label='Content', lines=15, placeholder='Enter text here', max_lines=10_000)
-                with gr.Group("Settings"):
-                    # The two sliders are passed to gen() as its temperature_qg / temperature_qa arguments.
-                    temperature_qg = gr.Slider(label='Diversity Penalty QG', value=0.2, minimum=0, maximum=1, step=0.01)
-                    temperature_qa = gr.Slider(label='Diversity Penalty QA', value=0.5, minimum=0, maximum=1, step=0.01)
-                    max_length = gr.Number(label='Max Length', value=85, minimum=1, step=1, maximum=512)
-                    num_return_sequences_qg = gr.Number(label='Number Questions', value=max_questions, minimum=1, step=1, maximum=max(max_questions, max_elem_value))
-                    num_return_sequences_qa = gr.Number(label="Number Answers", value=max_answers, minimum=1, step=1, maximum=max(max_questions, max_elem_value))
-                    seed = gr.Number(label="seed", value=42069)
-                    optimize_questions = gr.Checkbox(label="Optimize questions?", value=False)
-            with gr.Row():
-                gen_btn = gr.Button("Generate")
-            # The result area is registered with the Generate button's click
-            # event as its only trigger.
-            @gr.render(
-                inputs=[
-                    content, temperature_qg, temperature_qa, num_return_sequences_qg, num_return_sequences_qa,
-                    max_length, seed, optimize_questions
-                ],
-                triggers=[gen_btn.click]
-            )
-            def render_results(content, temperature_qg, temperature_qa, num_return_sequences_qg, num_return_sequences_qa, max_length, seed, optimize_questions):
-                # Refuse to run the models on empty or whitespace-only input.
-                if not content.strip():
-                    raise gr.Error('Please enter some content to generate questions and answers.')
-                qnas = gen(
-                    content, temperature_qg, temperature_qa, num_return_sequences_qg, num_return_sequences_qa,
-                    max_length, seed, optimize_questions
-                )
-                # `df` and `download` are never read again; the components are
-                # created only for display.
-                df = gr.Dataframe(
-                    value=[u.values() for u in qnas],
-                    headers=['Question', 'Answer'],
-                    col_count=2,
-                    wrap=True
-                )
-                # The same rows as a pandas frame, written to disk for the download button.
-                pd_df = pd.DataFrame([u.values() for u in qnas], columns=['Question', 'Answer'])
-                download = gr.DownloadButton(label='Download (without headers)', value=create_file_download(pd_df))
-            # NOTE(review): only inputs are given here and no outputs, so the
-            # stripped text looks like it is discarded - confirm intent.
-            content.change(lambda x: x.strip(), content)
-def new_main():
-    """Build the 'Content extraction from URL' tab and wire up its events.
-    Components are created as a side effect; nothing is returned. In this file
-    it is called inside the module-level `gr.Blocks` context.
-    """
-    with gr.Tab('Content extraction from URL'):
-        with gr.Tab(label='Explanation'):
-            gr.Markdown(
-                '''
-                # Content extraction from URL
-                This tab allows you to extract content from a URL and chunk it into sections.
-                ## How to use
-                1. Enter the URL of the webpage you want to extract content from.
-                2. Select the element class and class name of the HTML element you want to extract content from.
-                3. Click on the "Extract content" button.
-                The next section will give you insights into the extracted content.
-                This was done to give you the possibility to look at the extracted content, as well as manipulate it further.
-                Once you extract the content, you can choose the depth level to chunk the content into sections.
-                1. Enter the depth level you want to chunk the content into. **Note: <u>This is based on the HTML structure of the webpage, we're utilizing heading tags for this purpose</u>**
-                2. Click on the "Chunk content" button.
-                '''
-            )
-        with gr.Tab(label='Extract content'):
-            url = gr.Textbox(label='URL', placeholder='Enter URL here', lines=1, max_lines=1)
-            # NOTE(review): 'main' is listed twice in `choices`.
-            elem_class = gr.Dropdown(label='CSS element class', choices=['div', 'p', 'span', 'main', 'body', 'section', 'main'], value='div')
-            class_name = gr.Dropdown(label='CSS class name', choices=[], allow_custom_value=True)
-            extract_btn = gr.Button('Extract content')
-            with gr.Group():
-                # Holds the DocTree of the extracted page between events.
-                content_state = gr.State(None)
-                final_content = gr.Textbox(value='', show_copy_button=True, label='Final content', interactive=True)
-                with gr.Accordion('Reveal original input', open=False):
-                    og_content = gr.Textbox(value='', label='OG HTML content')
-            # The chunking controls stay hidden until render_results() returns a visible group.
-            with gr.Group(visible=False) as step_2_group:
-                depth_level = gr.Number(label='Depth level', value=1, minimum=0, step=1, maximum=6)
-                continue_btn = gr.Button('Chunk content')
-            def render_results(url, elem_class_, class_name_):
-                # Returns values in the order of the `outputs` list passed to extract_btn.click below.
-                if not url.strip():
-                    raise gr.Error('Please enter a URL to extract content.')
-                content = doctree_from_url(url, elem_class_, class_name_)
-                return [
-                    content,
-                    content.content,
-                    content.as_markdown(content.merge_sections(content.get_sections(0))),
-                    gr.Group(visible=True)
-                ]
-            def get_class_options(url, elem_class):
-                # Refill the class-name dropdown with the selectors found for the chosen tag.
-                if not url.strip():
-                    raise gr.Error('Please enter a URL to extract content.')
-                return gr.Dropdown(label='CSS class name', choices=list(set(get_selectors_for_class(url, elem_class))))
-            def update_content_state_on_final_change(final_content):
-                # Rebuild the DocTree from the (possibly edited) Markdown in the textbox.
-                html_content = markdown.markdown(final_content)
-                return DocTree(split_by_heading(html_content, 1))
-            @gr.render(inputs=[content_state, depth_level], triggers=[continue_btn.click])
-            def select_content(content, depth_level):
-                # Show one table row per section found at the requested depth.
-                if not content:
-                    raise gr.Error('Please extract content first.')
-                sections = content.get_sections_by_depth(depth_level)
-                print(f'Found {len(sections)} sections')
-                ds = []
-                for idx, section in enumerate(sections):
-                    ds.append([idx, content.as_markdown(content.merge_sections(section))])
-                gr.Dataframe(value=ds, headers=['Section #', 'Content'], interactive=True, wrap=True)
-            # Event wiring: tag change -> selector list, button -> extraction, text edit -> state refresh.
-            elem_class.change(
-                get_class_options,
-                inputs=[url, elem_class],
-                outputs=[class_name]
-            )
-            extract_btn.click(
-                render_results,
-                inputs=[
-                    url, elem_class, class_name,
-                ],
-                outputs=[
-                    content_state, og_content, final_content, step_2_group
-                ]
-            )
-            final_content.change(update_content_state_on_final_change, inputs=[final_content], outputs=[content_state])
-# Assemble the UI: an introduction followed by the two top-level tabs.
-with gr.Blocks() as demo:
-    gr.Markdown(
-        '''
-        # QA-Generator
-        A tool to build FAQs or QnAs from a given piece of text content.
-        ## How to use
-        We provide you two major functionalities:
-        1. **Content extraction from URL**: Extract content from a URL and chunk it into sections.
-        2. **QA Generator**: Generate questions and answers from a given text content.
-        Select the tab you want to use and follow the instructions.
-        '''
-    )
-    # The extraction tab is created first, then the generator tab.
-    new_main()
-    main()
-# Both calls run at module level, i.e. whenever this module is executed or imported.
-demo.queue()
-demo.launch()

optimization.py DELETED Viewed

@@ -1,66 +0,0 @@
-from collections import Counter
-from itertools import chain
-import math
-import torch
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-def ngrams(sequence, n):
-    return [tuple(sequence[i:i+n]) for i in range(len(sequence)-n+1)]
-def count_ngrams(sequence, max_n):
-    counts = Counter()
-    for n in range(1, max_n + 1):
-        counts.update(ngrams(sequence, n))
-    return counts
-def self_bleu(outputs):
-    smoothing_function = SmoothingFunction().method1
-    scores = []
-    for i in range(len(outputs)):
-        references = outputs[:i] + outputs[i+1:]
-        # Avoid calculating BLEU score for empty references
-        if references:
-            scores.append(sentence_bleu(references, outputs[i], smoothing_function=smoothing_function))
-    # If all references are empty, return a default value
-    if not scores:
-        return 0
-    return sum(scores) / len(scores)
-def dist_n(outputs, n):
-    all_ngrams = list(chain(*[ngrams(output, n) for output in outputs]))
-    unique_ngrams = set(all_ngrams)
-    return len(unique_ngrams) / len(all_ngrams) if all_ngrams else 0
-def perplexity(model, tokenizer, texts):
-    encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
-    max_length = model.config.n_positions
-    stride = 512
-    lls = []
-    for i in range(0, encodings.input_ids.size(1), stride):
-        begin_loc = max(i + stride - max_length, 0)
-        end_loc = i + stride
-        trg_len = end_loc - i
-        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-        with torch.no_grad():
-            outputs = model(input_ids, labels=target_ids)
-            log_likelihood = outputs.loss * trg_len
-        lls.append(log_likelihood)
-    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
-    return ppl.item()
-def js_divergence(p, q):
-    def kl_divergence(p, q):
-        return sum(p[i] * math.log(p[i] / q[i]) for i in range(len(p)) if p[i] != 0 and q[i] != 0)
-    p_norm = [float(i)/sum(p) for i in p]
-    q_norm = [float(i)/sum(q) for i in q]
-    m = [(p_norm[i] + q_norm[i]) / 2 for i in range(len(p_norm))]
-    return (kl_divergence(p_norm, m) + kl_divergence(q_norm, m)) / 2

requirements.txt DELETED Viewed

@@ -1,10 +0,0 @@
-transformers
-torch
-pandas
-scikit-learn
-nltk
-markdownify
-beautifulsoup4
-newspaper3k
-markdown
-lxml[html_clean]

src/__init__.py DELETED Viewed

File without changes

src/optimization.py DELETED Viewed

@@ -1,66 +0,0 @@
-from collections import Counter
-from itertools import chain
-import math
-import torch
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-def ngrams(sequence, n):
-    return [tuple(sequence[i:i+n]) for i in range(len(sequence)-n+1)]
-def count_ngrams(sequence, max_n):
-    counts = Counter()
-    for n in range(1, max_n + 1):
-        counts.update(ngrams(sequence, n))
-    return counts
-def self_bleu(outputs):
-    smoothing_function = SmoothingFunction().method1
-    scores = []
-    for i in range(len(outputs)):
-        references = outputs[:i] + outputs[i+1:]
-        # Avoid calculating BLEU score for empty references
-        if references:
-            scores.append(sentence_bleu(references, outputs[i], smoothing_function=smoothing_function))
-    # If all references are empty, return a default value
-    if not scores:
-        return 0
-    return sum(scores) / len(scores)
-def dist_n(outputs, n):
-    all_ngrams = list(chain(*[ngrams(output, n) for output in outputs]))
-    unique_ngrams = set(all_ngrams)
-    return len(unique_ngrams) / len(all_ngrams) if all_ngrams else 0
-def perplexity(model, tokenizer, texts):
-    encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
-    max_length = model.config.n_positions
-    stride = 512
-    lls = []
-    for i in range(0, encodings.input_ids.size(1), stride):
-        begin_loc = max(i + stride - max_length, 0)
-        end_loc = i + stride
-        trg_len = end_loc - i
-        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-        with torch.no_grad():
-            outputs = model(input_ids, labels=target_ids)
-            log_likelihood = outputs.loss * trg_len
-        lls.append(log_likelihood)
-    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
-    return ppl.item()
-def js_divergence(p, q):
-    def kl_divergence(p, q):
-        return sum(p[i] * math.log(p[i] / q[i]) for i in range(len(p)) if p[i] != 0 and q[i] != 0)
-    p_norm = [float(i)/sum(p) for i in p]
-    q_norm = [float(i)/sum(q) for i in q]
-    m = [(p_norm[i] + q_norm[i]) / 2 for i in range(len(p_norm))]
-    return (kl_divergence(p_norm, m) + kl_divergence(q_norm, m)) / 2

src/text.py DELETED Viewed

@@ -1,130 +0,0 @@
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup as BS
-from urllib.parse import urljoin
-from newspaper import Article
-import re
-import markdown
-def clean(s):
-    s = s.replace("\t", "\\t")
-    s = s.replace("\n", "\\n")
-    return s
-class DocTree:
-    def __init__(self, content):
-        self.content = content
-        self.max_depth = 6
-    def get_sections(self, *location_ids):
-        out = self.content
-        for id_ in location_ids:
-            out = out[id_]
-        return out
-    def merge_sections(self, elems):
-        if not isinstance(elems[0], list):
-            return '\n\n '.join(elems)
-        out = []
-        for e in elems:
-            out.append(self.merge_sections(e))
-        return '\n\n '.join(map(clean, out))
-    def get_merged_sections(self, *location_ids):
-        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]
-    def as_markdown(self, content):
-        return md(content)
-    def get_sections_by_depth(self, depth):
-        return self._get_sections_by_depth(self.content, depth)
-    @staticmethod
-    def _get_sections_by_depth(content, depth):
-        """Returns a list of merged sections at a specific depth"""
-        if depth == 0:
-            return content
-        out = []
-        for elem in content:
-            out += DocTree._get_sections_by_depth(elem, depth - 1)
-        return out
-def fix_relative_links(url, article_content):
-    if 'http' in url:
-        base_url = '/'.join(url.split('/')[:3])
-    else:
-        base_url = url.split('/')
-    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
-    res = pat.findall(article_content)
-    if res:
-        for g in res:
-            url = urljoin(base_url, g[1]) if g[1].startswith('/') else g[1]
-            article_content = article_content.replace(f'[{g[0]}]({g[1]})', f'[{g[0]}]({url})')
-    else:print('not found')
-    return article_content
-def extract_article(url):
-    article = Article(url)
-    article.download()
-    article.parse()
-    return article
-def select_content(html_code, elem_class, class_name):
-    print(f'Calling select_content with {elem_class}, {class_name}')
-    kwargs = {}
-    if class_name.startswith('.'):
-        class_name = class_name[1:]
-        kwargs = {'class_': class_name}
-    elif class_name.startswith('#'):
-        kwargs = {'id': class_name[1:]}
-    return md(str(BS(html_code, features="lxml").find(**kwargs)))
-def split_by_heading(html_content, _i):
-    if _i >= 7:
-        return html_content
-    elems = []
-    for idx, elem in enumerate([i for i in html_content.split(f'<h{_i}') if i]):
-        if idx > 0 or elem.startswith('>'):
-            elem = f'<h{_i}{elem}'
-        elems.append(split_by_heading(elem, _i+1))
-    return elems
-def doctree_from_url(url, elem_class='div', class_name='article-body'):
-    article = extract_article(url)
-    # convert to MD to handle splitting better
-    article_content = select_content(article.html, elem_class, class_name)
-    requires_title = list(filter(lambda x: x.strip().startswith('# '), article_content.split('\n'))) != []
-    if requires_title:
-        print('Didn\'t find title, will add it manually...')
-        article_content = f"# {article.title}\n\n{article_content}"
-    article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
-    # fix relative website links
-    article_content = fix_relative_links(url, article_content)
-    # convert back to HTML
-    html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
-    doc_tree = DocTree(split_by_heading(html_content, 1))
-    #assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
-    return doc_tree
-def get_selectors_for_class(url, elem_class):
-    article = extract_article(url)
-    html_content = article.html
-    soup = BS(html_content, features="lxml")
-    classes = set()
-    ids = set()
-    for elem in soup.find_all(elem_class):
-        if elem.get('class'):
-            for c in elem.get('class'):
-                classes |= {f".{c}"}
-        if elem.get('id'):
-            ids |= {f"#{elem.get('id')}"}
-    return ids | classes

text.py DELETED Viewed

@@ -1,130 +0,0 @@
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup as BS
-from IPython.display import display, Markdown
-from urllib.parse import urljoin
-from newspaper import Article
-import re
-import markdown
-def clean(s):
-    s = s.replace("\t", "\\t")
-    s = s.replace("\n", "\\n")
-    return s
-class DocTree:
-    def __init__(self, content):
-        self.content = content
-        self.max_depth = 6
-    def get_sections(self, *location_ids):
-        out = self.content
-        for id_ in location_ids:
-            out = out[id_]
-        return out
-    def merge_sections(self, elems):
-        if not isinstance(elems[0], list):
-            return '\n\n '.join(elems)
-        out = []
-        for e in elems:
-            out.append(self.merge_sections(e))
-        return '\n\n '.join(map(clean, out))
-    def get_merged_sections(self, *location_ids):
-        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]
-    def as_markdown(self, content):
-        return md(content)
-    def get_sections_by_depth(self, depth):
-        return self._get_sections_by_depth(self.content, depth)
-    @staticmethod
-    def _get_sections_by_depth(content, depth):
-        """Returns a list of merged sections at a specific depth"""
-        if depth == 0:
-            return content
-        out = []
-        for elem in content:
-            out += DocTree._get_sections_by_depth(elem, depth - 1)
-        return out
-def fix_relative_links(url, article_content):
-    if 'http' in url:
-        base_url = '/'.join(url.split('/')[:3])
-    else:
-        base_url = url.split('/')
-    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
-    res = pat.findall(article_content)
-    if res:
-        for g in res:
-            url = urljoin(base_url, g[1]) if g[1].startswith('/') else g[1]
-            article_content = article_content.replace(f'[{g[0]}]({g[1]})', f'[{g[0]}]({url})')
-    else:print('not found')
-    return article_content
-def extract_article(url):
-    article = Article(url)
-    article.download()
-    article.parse()
-    return article
-def select_content(html_code, elem_class, class_name):
-    print(f'Calling select_content with {elem_class}, {class_name}')
-    if class_name.startswith('.'):
-        class_name = class_name[1:]
-        elem_id = None
-    elif class_name.startswith('#'):
-        elem_id = class_name[1:]
-        class_name = None
-    else:
-        elem_id = None
-        class_name = None
-    return md(str(BS(html_code, features="lxml").find(elem_class, class_=class_name, id=elem_id)))
-def split_by_heading(html_content, _i):
-    if _i >= 7:
-        return html_content
-    elems = []
-    for idx, elem in enumerate([i for i in html_content.split(f'<h{_i}') if i]):
-        if idx > 0 or elem.startswith('>'):
-            elem = f'<h{_i}{elem}'
-        elems.append(split_by_heading(elem, _i+1))
-    return elems
-def doctree_from_url(url, elem_class='div', class_name='article-body'):
-    article = extract_article(url)
-    # convert to MD to handle splitting better
-    article_content = select_content(article.html, elem_class, class_name)
-    article_content = (f"# {article.title}\n\n" + article_content).replace('\n\n', '\n').replace('#', '%%@@%%')
-    # fix relative website links
-    article_content = fix_relative_links(url, article_content)
-    # convert back to HTML
-    html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
-    doc_tree = DocTree(split_by_heading(html_content, 1))
-    #assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
-    return doc_tree
-def get_selectors_for_class(url, elem_class):
-    article = extract_article(url)
-    html_content = article.html
-    soup = BS(html_content, features="lxml")
-    classes = set()
-    ids = set()
-    for elem in soup.find_all(elem_class):
-        if elem.get('class'):
-            for c in elem.get('class'):
-                classes |= {f".{c}"}
-        if elem.get('id'):
-            for c in elem.get('id'):
-                ids |= {f"#{c}"}
-    return ids | classes