// Spaces: zpsajst / linkscout-backend (Sleeping)
// File size: 14,254 Bytes
// 2398be6

/**
 * Content Extractor V2 - Enhanced Full Paragraph Extraction
 * Extracts ALL paragraphs from article with structure preservation
 */

const ContentExtractorV2 = {
  /**
   * Extract the complete article content of the current page.
   *
   * Reads `window.location` and `document`; the article container is located
   * once and shared by paragraph and image extraction.
   *
   * @returns {object} Object with `title`, `subtitle`, `author`, `publishDate`,
   *   `source`, `url`, `pageType`, `contentType`, `paragraphs`, `images`,
   *   `metadata`, `fullText` and `confidence` (0-100).
   */
  extractFullContent() {
    console.log('🔍 Starting enhanced content extraction...');

    // Extract metadata first
    const metadata = this.extractMetadata();

    // Resolve the container a single time; the lookup scans the DOM and can be
    // expensive when it has to fall back to the largest-text-container search.
    const articleElement = this.findArticleContainer();
    const { paragraphs, fullText } = this.extractAllParagraphs(articleElement);

    const content = {
      title: this.extractTitle(),
      subtitle: '',
      author: metadata.author,
      publishDate: metadata.publishDate,
      source: window.location.hostname,
      url: window.location.href,
      pageType: this.detectPageType(),
      contentType: this.detectContentType(),
      paragraphs,  // Array of paragraph objects
      images: this.extractImages(articleElement),
      metadata,
      fullText,
      confidence: 0
    };

    // Calculate extraction confidence
    content.confidence = this.calculateConfidence(content);

    console.log('✅ Extraction complete:', {
      title: content.title,
      paragraphs: content.paragraphs.length,
      characters: content.fullText.length,
      pageType: content.pageType,
      contentType: content.contentType,
      confidence: content.confidence
    });

    return content;
  },

  /**
   * Extract all content paragraphs from the article container.
   *
   * Candidates are `p`, headings, `li`, `blockquote` and `figcaption`
   * elements. Each candidate contributes only its own text (text of nested
   * candidates is reported by those nested elements), so nested markup such as
   * `blockquote > p` does not appear twice. Candidates with fewer than 20
   * characters of own text, or flagged by `isNonContentElement`, are skipped.
   *
   * @param {Element|null} [articleElement] Container to read from. Defaults to
   *   `findArticleContainer()`; a falsy value triggers `fallbackExtraction()`.
   * @returns {{paragraphs: object[], fullText: string}} Paragraph objects and
   *   their texts joined by blank lines.
   */
  extractAllParagraphs(articleElement = this.findArticleContainer()) {
    console.log('📝 Extracting all paragraphs...');

    if (!articleElement) {
      console.warn('⚠️ No article container found, using body');
      return this.fallbackExtraction();
    }

    console.log('✓ Found article container:', articleElement.tagName, articleElement.className);

    const candidateSelector = 'p, h1, h2, h3, h4, h5, h6, li, blockquote, figcaption';
    const candidates = articleElement.querySelectorAll(candidateSelector);

    console.log(`📄 Found ${candidates.length} paragraph elements`);

    const paragraphs = [];
    for (const element of candidates) {
      const text = this.cleanText(this.getOwnText(element, candidateSelector));

      // Skip if too short (likely not content)
      if (text.length < 20) {
        continue;
      }

      // Skip if it's a navigation/ad element
      if (this.isNonContentElement(element)) {
        continue;
      }

      paragraphs.push(this.buildParagraph(element, text, paragraphs.length));
    }

    console.log(`✅ Extracted ${paragraphs.length} valid paragraphs`);

    return {
      paragraphs,
      fullText: paragraphs.map((paragraph) => paragraph.text).join('\n\n')
    };
  },

  /**
   * Return the text of `element` without the text of descendants that match
   * `nestedSelector`.
   *
   * @param {Element} element Element to read.
   * @param {string} nestedSelector Selector of descendants to leave out.
   * @returns {string} Raw (uncleaned) text.
   */
  getOwnText(element, nestedSelector) {
    if (!element.querySelector(nestedSelector)) {
      return element.textContent;
    }

    // Work on a detached copy so the live page is never modified.
    const copy = element.cloneNode(true);
    copy.querySelectorAll(nestedSelector).forEach((nested) => nested.remove());
    return copy.textContent;
  },

  /**
   * Build the paragraph record shared by the main and fallback extraction.
   *
   * @param {Element} element Source element.
   * @param {string} text Cleaned paragraph text.
   * @param {number} index Running index of the paragraph in the result list.
   * @returns {object} Paragraph object with text, type, identifiers, XPath and
   *   offset position.
   */
  buildParagraph(element, text, index) {
    return {
      index,
      type: element.tagName.toLowerCase(),
      text,
      length: text.length,
      element_id: element.id || null,
      element_class: element.className || null,
      xpath: this.getXPath(element),  // For precise location
      position: {
        top: element.offsetTop,
        left: element.offsetLeft
      }
    };
  },

  /**
   * Find the main article container element.
   *
   * Tries a prioritized list of selectors and returns the first match whose
   * trimmed text is longer than 100 characters; otherwise falls back to
   * `findLargestTextContainer()`.
   *
   * @returns {Element|null} The container, or null when nothing qualifies.
   */
  findArticleContainer() {
    // Priority selectors for article content
    const selectors = [
      // Semantic HTML
      'article[role="main"]',
      'article[role="article"]',
      'main article',
      'article',
      '[role="article"]',
      '[role="main"]',
      'main',

      // Schema.org
      '[itemprop="articleBody"]',
      '[itemtype*="Article"]',

      // Common class patterns
      '.article-body',
      '.article-content',
      '.article__body',
      '.story-body',
      '.post-content',
      '.entry-content',
      '.post-body',
      '.content-body',

      // News-specific
      '.ins_storybody',  // NDTV
      '.story-body__inner',  // BBC
      '.article__content',  // Guardian
      '.article-text',  // Hindu
      '.single-post',  // PageSix, NY Post
      '.entry__content',  // PageSix
      '.single__content',  // PageSix
      '.post__content',  // Various news sites
      '.story__content'  // Various news sites
    ];

    for (const selector of selectors) {
      const element = document.querySelector(selector);
      if (element && element.textContent.trim().length > 100) {
        return element;
      }
    }

    // Fallback: Find largest text container
    return this.findLargestTextContainer();
  },

  /**
   * Find the `div`, `section` or `article` holding the most `<p>` descendants
   * among those with more than 500 characters of trimmed text.
   *
   * An ancestor always holds at least as many paragraphs as its descendants,
   * so on a tie the search narrows to a descendant of the current pick; this
   * yields the innermost container instead of the outermost page wrapper.
   *
   * @returns {Element|null} The chosen container, or null if none qualifies.
   */
  findLargestTextContainer() {
    const candidates = document.querySelectorAll('div, section, article');
    let largest = null;
    let maxParagraphs = 0;

    for (const element of candidates) {
      const paragraphCount = element.querySelectorAll('p').length;
      if (paragraphCount === 0 || paragraphCount < maxParagraphs) {
        continue;
      }
      if (element.textContent.trim().length <= 500) {
        continue;
      }

      const isTighterFit =
        paragraphCount === maxParagraphs && largest !== null && largest.contains(element);

      if (paragraphCount > maxParagraphs || isTighterFit) {
        maxParagraphs = paragraphCount;
        largest = element;
      }
    }

    return largest;
  },

  /**
   * Check if an element is likely non-content (nav, ads, timestamps, etc.).
   *
   * Two groups of heuristics are used:
   *  - Text heuristics (timestamp patterns, UI phrases) run only when the
   *    whitespace-normalized text is at most 200 characters, so a full body
   *    paragraph that merely mentions "published" or "3:45 pm" is kept. Short
   *    sentences containing those phrases are still rejected.
   *  - Class/id heuristics: long keywords match as substrings; the short,
   *    ambiguous ones (`ad`, `nav`, `date`, `time`) must sit on a token
   *    boundary so that classes like `leading-relaxed`, `canvas` or `update`
   *    are not treated as ads/navigation/dates.
   *
   * @param {Element} element Element to classify; `textContent`, `className`
   *   and `id` are read.
   * @returns {boolean} True when the element should be skipped.
   */
  isNonContentElement(element) {
    const maxChromeTextLength = 200;
    const text = String(element.textContent ?? '')
      .replace(/\s+/g, ' ')
      .trim()
      .toLowerCase();

    if (text.length <= maxChromeTextLength) {
      // Check for timestamps (e.g., "2 hours ago", "Published: Jan 10, 2025", "Updated: 3:45 PM")
      const timestampPatterns = [
        /\d{1,2}\s*(hours?|mins?|minutes?|seconds?|days?|weeks?|months?|years?)\s*ago/i,
        /published:?\s*\d/i,
        /updated:?\s*\d/i,
        /\d{1,2}:\d{2}\s*(am|pm)/i,
        /^\s*\d{1,2}\/\d{1,2}\/\d{2,4}\s*$/,  // Date format: 10/18/2025
        /^(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2},?\s+\d{4}/i  // Jan 10, 2025
      ];

      if (timestampPatterns.some((pattern) => pattern.test(text))) {
        return true;
      }

      // Check for navigation/UI text
      const navPatterns = [
        'skip to', 'menu', 'navigation', 'cookie', 'subscribe',
        'newsletter', 'advertisement', 'related articles', 'read more',
        'share this', 'follow us', 'sponsored', 'powered by', 'published',
        'updated', 'posted', 'last modified'
      ];

      if (navPatterns.some((phrase) => text.includes(phrase))) {
        return true;
      }
    }

    // Check classes/IDs
    const substringKeywords =
      /menu|footer|header|sidebar|advert|related|recommend|share|social|comment|cookie|timestamp|publish|author|byline/;
    // `ad`/`ads` must be a whole token; `nav`, `date`, `time` must start one.
    const tokenKeywords = /(?:^| )(?:ads?(?= |$)|nav|date|time)/;

    const isExcludedName = (name) => {
      if (!name) {
        return false;
      }
      if (substringKeywords.test(name.toLowerCase())) {
        return true;
      }
      // Split camelCase and punctuation into space-separated lower-case tokens.
      const tokens = name
        .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, ' ')
        .trim();
      return tokenKeywords.test(tokens);
    };

    // `className` is not a plain string on SVG elements; read the attribute there.
    const className = typeof element.className === 'string'
      ? element.className
      : (element.getAttribute?.('class') ?? '');
    const id = typeof element.id === 'string' ? element.id : '';

    return isExcludedName(className) || isExcludedName(id);
  },

  /**
   * Get an XPath expression locating `element`.
   *
   * Elements with an id yield `//*[@id="..."]` (unless the id contains a
   * double quote, which cannot be embedded in that expression). Otherwise a
   * path of lower-cased tag names from the root is built; a step gets a
   * 1-based `[n]` only when the element has preceding same-name siblings.
   *
   * @param {Element} element Element to locate.
   * @returns {string} XPath expression, or '' when no path can be built.
   */
  getXPath(element) {
    if (element.id && !element.id.includes('"')) {
      return `//*[@id="${element.id}"]`;
    }

    const steps = [];
    for (
      let node = element;
      node && node.nodeType === Node.ELEMENT_NODE;
      node = node.parentNode
    ) {
      let precedingSameName = 0;
      for (let sibling = node.previousSibling; sibling; sibling = sibling.previousSibling) {
        if (sibling.nodeType === Node.ELEMENT_NODE && sibling.nodeName === node.nodeName) {
          precedingSameName += 1;
        }
      }

      const tagName = node.nodeName.toLowerCase();
      steps.unshift(precedingSameName > 0 ? `${tagName}[${precedingSameName + 1}]` : tagName);
    }

    return steps.length ? `/${steps.join('/')}` : '';
  },

  /**
   * Fallback extraction used when no article container is found: reads every
   * `<p>` on the page, keeps those with at least 15 characters that are not
   * flagged by `isNonContentElement`, and returns the same result shape as
   * `extractAllParagraphs`.
   *
   * @returns {{paragraphs: object[], fullText: string}} Paragraph objects and
   *   their texts joined by blank lines.
   */
  fallbackExtraction() {
    console.log('⚠️ Using fallback extraction - extracting ALL <p> tags from page');

    const allParagraphs = document.querySelectorAll('p');

    console.log(`📄 Found ${allParagraphs.length} total <p> tags on page`);

    const paragraphs = [];
    for (const p of allParagraphs) {
      const text = this.cleanText(p.textContent);

      // ✅ VERY LENIENT: Accept even short paragraphs (15+ chars)
      if (text.length < 15) {
        continue;
      }

      // ✅ Skip ONLY obvious non-content (but be lenient)
      if (this.isNonContentElement(p)) {
        console.log(`   ⏭️ Skipping: "${text.substring(0, 50)}..."`);
        continue;
      }

      // Use running index, not DOM index
      paragraphs.push(this.buildParagraph(p, text, paragraphs.length));
    }

    console.log(`✅ Fallback extracted ${paragraphs.length} paragraphs`);

    return {
      paragraphs,
      fullText: paragraphs.map((paragraph) => paragraph.text).join('\n\n')
    };
  },

  /**
   * Detect page type (news, blog, generic article, plain webpage).
   *
   * News portals are matched on the exact host or a subdomain of it, so a
   * look-alike host such as `notbbc.com.example` is not classified as news.
   *
   * @returns {string} 'news-article', 'article', 'blog-post' or 'webpage'.
   */
  detectPageType() {
    const url = window.location.href.toLowerCase();
    const hostname = window.location.hostname.toLowerCase();

    // News portals
    const newsPortals = [
      'bbc.com', 'cnn.com', 'nytimes.com', 'theguardian.com', 'reuters.com',
      'ndtv.com', 'timesofindia.com', 'timesofindia.indiatimes.com',
      'hindustantimes.com', 'thehindu.com'
    ];

    const isNewsPortal = newsPortals.some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
    );
    if (isNewsPortal) {
      return 'news-article';
    }

    // Check for article indicators
    if (document.querySelector('article') || document.querySelector('[itemtype*="Article"]')) {
      return 'article';
    }

    // Blog detection
    if (url.includes('blog') || hostname.includes('wordpress') || hostname.includes('blogger')) {
      return 'blog-post';
    }

    return 'webpage';
  },

  /**
   * Detect content type from meta tags, the document title and the URL.
   *
   * @returns {{type: string, confidence: number, category?: string}} Detected
   *   type with a confidence between 0 and 1; `category` is set only for
   *   articles that declare `article:section`.
   */
  detectContentType() {
    // Check meta tags
    const ogType = document.querySelector('meta[property="og:type"]')?.content;
    const articleSection = document.querySelector('meta[property="article:section"]')?.content;

    if (ogType === 'article') {
      if (articleSection) {
        return {
          type: 'news-article',
          category: articleSection.toLowerCase(),
          confidence: 0.9
        };
      }
      return { type: 'article', confidence: 0.8 };
    }

    // Check for opinion/editorial indicators
    const title = document.title.toLowerCase();
    const url = window.location.href.toLowerCase();
    const mentions = (word) => title.includes(word) || url.includes(word);

    if (mentions('opinion') || mentions('editorial')) {
      return { type: 'opinion-piece', confidence: 0.85 };
    }

    // Check for scientific paper
    if (document.querySelector('meta[name="citation_title"]')) {
      return { type: 'scientific-paper', confidence: 0.95 };
    }

    // Default
    return { type: 'general-content', confidence: 0.5 };
  },

  /**
   * Extract the title, trying in order: Open Graph title, a headline `<h1>`,
   * then `document.title`. Blank candidates are skipped instead of returned.
   *
   * @returns {string} Trimmed title, or 'Untitled' when nothing is available.
   */
  extractTitle() {
    // Try Open Graph
    const ogTitle = document.querySelector('meta[property="og:title"]')?.content?.trim();
    if (ogTitle) {
      return ogTitle;
    }

    // Try headline element
    const headline = document
      .querySelector('article h1, .article h1, h1.title, h1.headline, main h1')
      ?.textContent.trim();
    if (headline) {
      return headline;
    }

    // Fallback to page title
    return document.title || 'Untitled';
  },

  /**
   * Extract metadata (author, publish date, section, tags) from meta tags and
   * well-known elements. `modifiedDate` is always returned empty.
   *
   * @returns {{author: string, publishDate: string, modifiedDate: string,
   *   section: string, tags: string[]}} Collected metadata; missing values are
   *   empty strings.
   */
  extractMetadata() {
    // First candidate that is a non-empty string once trimmed, else ''.
    const firstText = (...candidates) => {
      for (const candidate of candidates) {
        const value = typeof candidate === 'string' ? candidate.trim() : '';
        if (value) {
          return value;
        }
      }
      return '';
    };

    // Author
    const authorMeta = document.querySelector('meta[name="author"], meta[property="article:author"]');
    const authorElement = document.querySelector('[rel="author"], .author, .byline, [itemprop="author"]');

    // Publish date
    const dateMeta = document.querySelector('meta[property="article:published_time"], meta[name="publish-date"]');
    const dateElement = document.querySelector('time[datetime], .publish-date, [itemprop="datePublished"]');

    // Section/category
    const sectionMeta = document.querySelector('meta[property="article:section"]');

    // Tags
    const tagMeta = document.querySelectorAll('meta[property="article:tag"]');

    return {
      author: firstText(authorMeta?.content, authorElement?.textContent),
      publishDate: firstText(
        dateMeta?.content,
        dateElement?.getAttribute('datetime'),
        dateElement?.textContent
      ),
      modifiedDate: '',
      section: sectionMeta?.content || '',
      tags: Array.from(tagMeta, (tag) => tag.content)
    };
  },

  /**
   * Extract images larger than 100x100 from the article container.
   *
   * The caption is taken from the `<figcaption>` of the image's enclosing
   * `<figure>`, so wrapped images (`<picture>`, `<a>`) keep their caption and
   * images outside any figure do not pick up an unrelated one.
   *
   * @param {Element|null} [articleElement] Container to read from. Defaults to
   *   `findArticleContainer()`.
   * @returns {{index: number, src: string, alt: string, caption: string}[]}
   *   Image records; `index` is the position among all `<img>` in the container.
   */
  extractImages(articleElement = this.findArticleContainer()) {
    const images = [];

    if (!articleElement) {
      return images;
    }

    articleElement.querySelectorAll('img').forEach((img, index) => {
      // Skip small images (icons, etc.)
      if (!(img.width > 100 && img.height > 100)) {
        return;
      }

      const caption =
        img.closest('figure')?.querySelector('figcaption')?.textContent.trim() || '';

      images.push({
        index,
        src: img.src,
        alt: img.alt || '',
        caption
      });
    });

    return images;
  },

  /**
   * Clean text content: collapse every run of whitespace (including newlines
   * and non-breaking spaces) into one space and trim the ends.
   *
   * @param {string|null|undefined} text Raw text; null/undefined yield ''.
   * @returns {string} Single-line, trimmed text.
   */
  cleanText(text) {
    return String(text ?? '')
      .replace(/\s+/g, ' ')
      .trim();
  },

  /**
   * Calculate extraction confidence score.
   *
   * Points: title longer than 5 chars (20); paragraphs >0 / >3 / >10
   * (20 / +10 / +10); author (10); publish date (10); full text longer than
   * 500 / 2000 chars (10 / +10).
   *
   * @param {object} content Extraction result with `title`, `paragraphs`,
   *   `author`, `publishDate` and `fullText`.
   * @returns {number} Score between 0 and 100.
   */
  calculateConfidence(content) {
    const paragraphCount = content.paragraphs.length;
    const textLength = content.fullText.length;
    let score = 0;

    // Has title
    if (content.title && content.title.length > 5) score += 20;

    // Has paragraphs
    if (paragraphCount > 0) score += 20;
    if (paragraphCount > 3) score += 10;
    if (paragraphCount > 10) score += 10;

    // Has metadata
    if (content.author) score += 10;
    if (content.publishDate) score += 10;

    // Has sufficient text
    if (textLength > 500) score += 10;
    if (textLength > 2000) score += 10;

    return Math.min(score, 100);
  }
};

// Publish the extractor on the global object when a browser `window` exists
if (typeof window !== 'undefined') {
  Object.assign(window, { ContentExtractorV2 });
}