Spaces:
Sleeping
Sleeping
| /** | |
| * Content Extractor V2 - Enhanced Full Paragraph Extraction | |
| * Extracts ALL paragraphs from article with structure preservation | |
| */ | |
| const ContentExtractorV2 = { | |
| /** | |
| * Extract complete article content with all paragraphs | |
| */ | |
| extractFullContent() { | |
| console.log('π Starting enhanced content extraction...'); | |
| const content = { | |
| title: '', | |
| subtitle: '', | |
| author: '', | |
| publishDate: '', | |
| source: window.location.hostname, | |
| url: window.location.href, | |
| pageType: this.detectPageType(), | |
| contentType: this.detectContentType(), | |
| paragraphs: [], // Array of paragraph objects | |
| images: [], | |
| metadata: {}, | |
| fullText: '', | |
| confidence: 0 | |
| }; | |
| // Extract metadata first | |
| content.metadata = this.extractMetadata(); | |
| content.title = this.extractTitle(); | |
| content.author = content.metadata.author; | |
| content.publishDate = content.metadata.publishDate; | |
| // Extract ALL paragraphs with structure | |
| const paragraphsData = this.extractAllParagraphs(); | |
| content.paragraphs = paragraphsData.paragraphs; | |
| content.fullText = paragraphsData.fullText; | |
| content.images = this.extractImages(); | |
| // Calculate extraction confidence | |
| content.confidence = this.calculateConfidence(content); | |
| console.log('β Extraction complete:', { | |
| title: content.title, | |
| paragraphs: content.paragraphs.length, | |
| characters: content.fullText.length, | |
| pageType: content.pageType, | |
| contentType: content.contentType, | |
| confidence: content.confidence | |
| }); | |
| return content; | |
| }, | |
| /** | |
| * Extract ALL paragraphs from the article | |
| * Returns array of paragraph objects with metadata | |
| */ | |
| extractAllParagraphs() { | |
| console.log('π Extracting all paragraphs...'); | |
| const paragraphs = []; | |
| let fullText = ''; | |
| let paragraphIndex = 0; | |
| // Find the main article container | |
| const articleElement = this.findArticleContainer(); | |
| if (!articleElement) { | |
| console.warn('β οΈ No article container found, using body'); | |
| return this.fallbackExtraction(); | |
| } | |
| console.log('β Found article container:', articleElement.tagName, articleElement.className); | |
| // Extract all paragraph elements | |
| const paragraphElements = articleElement.querySelectorAll('p, h1, h2, h3, h4, h5, h6, li, blockquote, figcaption'); | |
| console.log(`π Found ${paragraphElements.length} paragraph elements`); | |
| paragraphElements.forEach((element, index) => { | |
| const text = this.cleanText(element.textContent); | |
| // Skip if too short (likely not content) | |
| if (text.length < 20) { | |
| return; | |
| } | |
| // Skip if it's a navigation/ad element | |
| if (this.isNonContentElement(element)) { | |
| return; | |
| } | |
| const paragraph = { | |
| index: paragraphIndex, | |
| type: element.tagName.toLowerCase(), | |
| text: text, | |
| length: text.length, | |
| element_id: element.id || null, | |
| element_class: element.className || null, | |
| xpath: this.getXPath(element), // For precise location | |
| position: { | |
| top: element.offsetTop, | |
| left: element.offsetLeft | |
| } | |
| }; | |
| paragraphs.push(paragraph); | |
| fullText += text + '\n\n'; | |
| paragraphIndex++; | |
| }); | |
| console.log(`β Extracted ${paragraphs.length} valid paragraphs`); | |
| return { | |
| paragraphs: paragraphs, | |
| fullText: fullText.trim() | |
| }; | |
| }, | |
| /** | |
| * Find the main article container element | |
| */ | |
| findArticleContainer() { | |
| // Priority selectors for article content | |
| const selectors = [ | |
| // Semantic HTML | |
| 'article[role="main"]', | |
| 'article[role="article"]', | |
| 'main article', | |
| 'article', | |
| '[role="article"]', | |
| '[role="main"]', | |
| 'main', | |
| // Schema.org | |
| '[itemprop="articleBody"]', | |
| '[itemtype*="Article"]', | |
| // Common class patterns | |
| '.article-body', | |
| '.article-content', | |
| '.article__body', | |
| '.story-body', | |
| '.post-content', | |
| '.entry-content', | |
| '.post-body', | |
| '.content-body', | |
| // News-specific | |
| '.ins_storybody', // NDTV | |
| '.story-body__inner', // BBC | |
| '.article__content', // Guardian | |
| '.article-text', // Hindu | |
| '.single-post', // PageSix, NY Post | |
| '.entry__content', // PageSix | |
| '.single__content', // PageSix | |
| '.post__content', // Various news sites | |
| '.story__content' // Various news sites | |
| ]; | |
| for (const selector of selectors) { | |
| const element = document.querySelector(selector); | |
| if (element && element.textContent.trim().length > 100) { | |
| return element; | |
| } | |
| } | |
| // Fallback: Find largest text container | |
| return this.findLargestTextContainer(); | |
| }, | |
| /** | |
| * Find the element with the most paragraph content | |
| */ | |
| findLargestTextContainer() { | |
| const candidates = document.querySelectorAll('div, section, article'); | |
| let largest = null; | |
| let maxParagraphs = 0; | |
| candidates.forEach(element => { | |
| const paragraphs = element.querySelectorAll('p'); | |
| const textLength = element.textContent.trim().length; | |
| if (paragraphs.length > maxParagraphs && textLength > 500) { | |
| maxParagraphs = paragraphs.length; | |
| largest = element; | |
| } | |
| }); | |
| return largest; | |
| }, | |
| /** | |
| * Check if element is likely non-content (nav, ads, etc.) | |
| */ | |
| isNonContentElement(element) { | |
| const text = element.textContent.toLowerCase(); | |
| const classList = element.className.toLowerCase(); | |
| const id = element.id?.toLowerCase() || ''; | |
| // Check for timestamps (e.g., "2 hours ago", "Published: Jan 10, 2025", "Updated: 3:45 PM") | |
| const timestampPatterns = [ | |
| /\d{1,2}\s*(hours?|mins?|minutes?|seconds?|days?|weeks?|months?|years?)\s*ago/i, | |
| /published:?\s*\d/i, | |
| /updated:?\s*\d/i, | |
| /\d{1,2}:\d{2}\s*(am|pm)/i, | |
| /^\s*\d{1,2}\/\d{1,2}\/\d{2,4}\s*$/, // Date format: 10/18/2025 | |
| /^(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2},?\s+\d{4}/i // Jan 10, 2025 | |
| ]; | |
| if (timestampPatterns.some(pattern => pattern.test(text))) { | |
| return true; | |
| } | |
| // Check for navigation/UI text | |
| const navPatterns = [ | |
| 'skip to', 'menu', 'navigation', 'cookie', 'subscribe', | |
| 'newsletter', 'advertisement', 'related articles', 'read more', | |
| 'share this', 'follow us', 'sponsored', 'powered by', 'published', | |
| 'updated', 'posted', 'last modified' | |
| ]; | |
| if (navPatterns.some(pattern => text.includes(pattern))) { | |
| return true; | |
| } | |
| // Check classes/IDs | |
| const excludePatterns = [ | |
| 'nav', 'menu', 'footer', 'header', 'sidebar', 'ad', 'advertisement', | |
| 'related', 'recommend', 'share', 'social', 'comment', 'cookie', | |
| 'timestamp', 'date', 'time', 'publish', 'author', 'byline' | |
| ]; | |
| if (excludePatterns.some(pattern => classList.includes(pattern) || id.includes(pattern))) { | |
| return true; | |
| } | |
| return false; | |
| }, | |
| /** | |
| * Get XPath of element for precise location | |
| */ | |
| getXPath(element) { | |
| if (element.id) { | |
| return `//*[@id="${element.id}"]`; | |
| } | |
| const parts = []; | |
| while (element && element.nodeType === Node.ELEMENT_NODE) { | |
| let index = 0; | |
| let sibling = element.previousSibling; | |
| while (sibling) { | |
| if (sibling.nodeType === Node.ELEMENT_NODE && sibling.nodeName === element.nodeName) { | |
| index++; | |
| } | |
| sibling = sibling.previousSibling; | |
| } | |
| const tagName = element.nodeName.toLowerCase(); | |
| const pathIndex = index > 0 ? `[${index + 1}]` : ''; | |
| parts.unshift(tagName + pathIndex); | |
| element = element.parentNode; | |
| } | |
| return parts.length ? '/' + parts.join('/') : ''; | |
| }, | |
| /** | |
| * Fallback extraction if no article container found | |
| */ | |
| fallbackExtraction() { | |
| console.log('β οΈ Using fallback extraction - extracting ALL <p> tags from page'); | |
| const paragraphs = []; | |
| const allParagraphs = document.querySelectorAll('p'); | |
| let fullText = ''; | |
| console.log(`π Found ${allParagraphs.length} total <p> tags on page`); | |
| allParagraphs.forEach((p, index) => { | |
| const text = this.cleanText(p.textContent); | |
| // β VERY LENIENT: Accept even short paragraphs (15+ chars) | |
| if (text.length < 15) { | |
| return; | |
| } | |
| // β Skip ONLY obvious non-content (but be lenient) | |
| if (this.isNonContentElement(p)) { | |
| console.log(` βοΈ Skipping: "${text.substring(0, 50)}..."`); | |
| return; | |
| } | |
| paragraphs.push({ | |
| index: paragraphs.length, // Use running index, not DOM index | |
| type: 'p', | |
| text: text, | |
| length: text.length, | |
| xpath: this.getXPath(p) | |
| }); | |
| fullText += text + '\n\n'; | |
| }); | |
| console.log(`β Fallback extracted ${paragraphs.length} paragraphs`); | |
| return { paragraphs, fullText }; | |
| }, | |
| /** | |
| * Detect page type (news, blog, social media, etc.) | |
| */ | |
| detectPageType() { | |
| const url = window.location.href.toLowerCase(); | |
| const hostname = window.location.hostname.toLowerCase(); | |
| // News portals | |
| const newsPortals = [ | |
| 'bbc.com', 'cnn.com', 'nytimes.com', 'theguardian.com', 'reuters.com', | |
| 'ndtv.com', 'timesofindia.com', 'hindustantimes.com', 'thehindu.com' | |
| ]; | |
| if (newsPortals.some(domain => hostname.includes(domain))) { | |
| return 'news-article'; | |
| } | |
| // Check for article indicators | |
| if (document.querySelector('article') || document.querySelector('[itemtype*="Article"]')) { | |
| return 'article'; | |
| } | |
| // Blog detection | |
| if (url.includes('blog') || hostname.includes('wordpress') || hostname.includes('blogger')) { | |
| return 'blog-post'; | |
| } | |
| return 'webpage'; | |
| }, | |
| /** | |
| * Detect content type from metadata and structure | |
| */ | |
| detectContentType() { | |
| // Check meta tags | |
| const ogType = document.querySelector('meta[property="og:type"]')?.content; | |
| const articleSection = document.querySelector('meta[property="article:section"]')?.content; | |
| if (ogType === 'article') { | |
| if (articleSection) { | |
| return { | |
| type: 'news-article', | |
| category: articleSection.toLowerCase(), | |
| confidence: 0.9 | |
| }; | |
| } | |
| return { type: 'article', confidence: 0.8 }; | |
| } | |
| // Check for opinion/editorial indicators | |
| const title = document.title.toLowerCase(); | |
| const url = window.location.href.toLowerCase(); | |
| if (title.includes('opinion') || url.includes('opinion') || | |
| title.includes('editorial') || url.includes('editorial')) { | |
| return { type: 'opinion-piece', confidence: 0.85 }; | |
| } | |
| // Check for scientific paper | |
| if (document.querySelector('meta[name="citation_title"]')) { | |
| return { type: 'scientific-paper', confidence: 0.95 }; | |
| } | |
| // Default | |
| return { type: 'general-content', confidence: 0.5 }; | |
| }, | |
| /** | |
| * Extract title from various sources | |
| */ | |
| extractTitle() { | |
| // Try Open Graph | |
| const ogTitle = document.querySelector('meta[property="og:title"]'); | |
| if (ogTitle) return ogTitle.content; | |
| // Try headline element | |
| const h1 = document.querySelector('article h1, .article h1, h1.title, h1.headline, main h1'); | |
| if (h1) return h1.textContent.trim(); | |
| // Fallback to page title | |
| return document.title || 'Untitled'; | |
| }, | |
| /** | |
| * Extract metadata (author, date, etc.) | |
| */ | |
| extractMetadata() { | |
| const metadata = { | |
| author: '', | |
| publishDate: '', | |
| modifiedDate: '', | |
| section: '', | |
| tags: [] | |
| }; | |
| // Author | |
| const authorMeta = document.querySelector('meta[name="author"], meta[property="article:author"]'); | |
| const authorElement = document.querySelector('[rel="author"], .author, .byline, [itemprop="author"]'); | |
| metadata.author = authorMeta?.content || authorElement?.textContent.trim() || ''; | |
| // Publish date | |
| const dateMeta = document.querySelector('meta[property="article:published_time"], meta[name="publish-date"]'); | |
| const dateElement = document.querySelector('time[datetime], .publish-date, [itemprop="datePublished"]'); | |
| metadata.publishDate = dateMeta?.content || dateElement?.getAttribute('datetime') || dateElement?.textContent.trim() || ''; | |
| // Section/category | |
| const sectionMeta = document.querySelector('meta[property="article:section"]'); | |
| metadata.section = sectionMeta?.content || ''; | |
| // Tags | |
| const tagMeta = document.querySelectorAll('meta[property="article:tag"]'); | |
| metadata.tags = Array.from(tagMeta).map(tag => tag.content); | |
| return metadata; | |
| }, | |
| /** | |
| * Extract images from article | |
| */ | |
| extractImages() { | |
| const images = []; | |
| const articleElement = this.findArticleContainer(); | |
| if (!articleElement) return images; | |
| const imgElements = articleElement.querySelectorAll('img'); | |
| imgElements.forEach((img, index) => { | |
| if (img.width > 100 && img.height > 100) { // Skip small images (icons, etc.) | |
| images.push({ | |
| index: index, | |
| src: img.src, | |
| alt: img.alt || '', | |
| caption: img.parentElement.querySelector('figcaption')?.textContent.trim() || '' | |
| }); | |
| } | |
| }); | |
| return images; | |
| }, | |
| /** | |
| * Clean text content | |
| */ | |
| cleanText(text) { | |
| return text | |
| .replace(/\s+/g, ' ') | |
| .replace(/\n{3,}/g, '\n\n') | |
| .trim(); | |
| }, | |
| /** | |
| * Calculate extraction confidence score | |
| */ | |
| calculateConfidence(content) { | |
| let score = 0; | |
| // Has title | |
| if (content.title && content.title.length > 5) score += 20; | |
| // Has paragraphs | |
| if (content.paragraphs.length > 0) score += 20; | |
| if (content.paragraphs.length > 3) score += 10; | |
| if (content.paragraphs.length > 10) score += 10; | |
| // Has metadata | |
| if (content.author) score += 10; | |
| if (content.publishDate) score += 10; | |
| // Has sufficient text | |
| if (content.fullText.length > 500) score += 10; | |
| if (content.fullText.length > 2000) score += 10; | |
| return Math.min(score, 100); | |
| } | |
| }; | |
| // Make it globally available | |
| if (typeof window !== 'undefined') { | |
| window.ContentExtractorV2 = ContentExtractorV2; | |
| } | |