Spaces:
Running
Running
/**
 * Content Extractor - page-content extraction with ad filtering.
 *
 * Identifies the page type, extracts title/author/date metadata and the main
 * text of the current document, and collects individual articles when several
 * are present. All methods read from the global `document` / `window`.
 */
const ContentExtractor = {
  /**
   * Extract the main content of the current page.
   *
   * @returns {{text: string, title: string, author: string, publishDate: string,
   *            source: string, pageType: string, articles: Array<Object>,
   *            confidence: number, adBlockedCount: number}}
   *   Extraction result; `confidence` is a 0-100 heuristic score.
   */
  extractMainContent() {
    const content = {
      text: '',
      title: '',
      author: '',
      publishDate: '',
      source: window.location.hostname,
      pageType: this.detectPageType(),
      articles: [], // Multiple articles if detected
      confidence: 0,
      adBlockedCount: 0
    };

    content.title = this.extractTitle();

    const metadata = this.extractMetadata();
    content.author = metadata.author;
    content.publishDate = metadata.publishDate;

    // Main text, with clutter/ad filtering applied by cleanText().
    const extractionResult = this.extractText();
    content.text = extractionResult.text;
    content.adBlockedCount = extractionResult.adsBlocked;

    // FALLBACK: if filtering left (almost) nothing, retry with minimal cleaning.
    if (!content.text || content.text.length < 100) {
      console.warn('β οΈ Ad filtering too aggressive, trying minimal cleaning...');
      const article = document.querySelector('[itemprop="articleBody"], article, main');
      if (article) {
        // Only drop scripts and styles; keep everything else.
        const simpleClone = article.cloneNode(true);
        simpleClone.querySelectorAll('script, style, noscript').forEach(el => el.remove());
        // Strip the label before collapsing whitespace so no double spaces remain.
        const simpleText = simpleClone.textContent
          .replace(/Advertisement/gi, '')
          .replace(/\s+/g, ' ')
          .trim();
        if (simpleText.length > 100) {
          console.log(`β Minimal cleaning recovered ${simpleText.length} characters`);
          content.text = simpleText;
        }
      }
    }

    content.articles = this.extractMultipleArticles();
    content.confidence = this.calculateExtractionConfidence(content);

    console.log('Content Extraction Summary:', {
      pageType: content.pageType,
      textLength: content.text.length,
      articlesFound: content.articles.length,
      adsBlocked: content.adBlockedCount,
      confidence: content.confidence
    });
    return content;
  },

  /**
   * Detect the page type (social network, news portal, blog, article page).
   *
   * Domains are matched against the hostname itself or any of its subdomains,
   * never as raw substrings, so e.g. "netflix.com" is not mistaken for "x.com".
   *
   * @returns {string} 'social-<network>', 'news-portal', 'blog',
   *   'article-page' or 'unknown'.
   */
  detectPageType() {
    const url = window.location.href.toLowerCase();
    const hostname = window.location.hostname.toLowerCase();

    // True when the current host is `domain` or a subdomain of it.
    const isOnDomain = domain =>
      hostname === domain || hostname.endsWith(`.${domain}`);

    // Social Media Platforms
    const socialMediaPatterns = {
      'facebook.com': 'social-facebook',
      'fb.com': 'social-facebook',
      'twitter.com': 'social-twitter',
      'x.com': 'social-twitter',
      'instagram.com': 'social-instagram',
      'linkedin.com': 'social-linkedin',
      'reddit.com': 'social-reddit',
      'tiktok.com': 'social-tiktok',
      'youtube.com': 'social-youtube',
      'pinterest.com': 'social-pinterest',
      'snapchat.com': 'social-snapchat',
      'whatsapp.com': 'social-whatsapp',
      'telegram.org': 'social-telegram',
      'discord.com': 'social-discord'
    };
    for (const [domain, type] of Object.entries(socialMediaPatterns)) {
      if (isOnDomain(domain)) {
        console.log(`β Page Type: ${type}`);
        return type;
      }
    }
    // "mastodon" is a keyword rather than a single domain, so it is matched
    // anywhere in the hostname (e.g. mastodon.social, mastodon.online).
    if (hostname.includes('mastodon')) {
      console.log(`β Page Type: ${'social-mastodon'}`);
      return 'social-mastodon';
    }

    // News Portals (Major domains)
    const newsPortals = [
      // US/International News
      'bbc.com', 'cnn.com', 'nytimes.com', 'theguardian.com', 'reuters.com',
      'apnews.com', 'bloomberg.com', 'wsj.com', 'washingtonpost.com',
      'forbes.com', 'time.com', 'newsweek.com', 'usatoday.com',
      'nbcnews.com', 'abcnews.go.com', 'cbsnews.com', 'foxnews.com',
      'aljazeera.com', 'economist.com', 'politico.com', 'thehill.com',
      'axios.com', 'vice.com', 'vox.com', 'buzzfeednews.com', 'huffpost.com',
      // Indian News
      'timesofindia.indiatimes.com', 'hindustantimes.com', 'indianexpress.com',
      'ndtv.com', 'thehindu.com', 'news18.com', 'dnaindia.com', 'india.com',
      'firstpost.com', 'thequint.com', 'scroll.in', 'thenewsminute.com',
      // Global News
      'dw.com', 'france24.com', 'rt.com', 'sputniknews.com',
      'scmp.com', 'japantimes.co.jp', 'straitstimes.com'
    ];
    if (newsPortals.some(isOnDomain)) {
      console.log('β Page Type: news-portal');
      return 'news-portal';
    }

    // Blog Detection: hostname keywords, hosting-platform domains, URL paths.
    const blogHostKeywords = ['blog', 'wordpress', 'blogspot'];
    const blogDomains = [
      'medium.com', 'substack.com', 'ghost.io', 'tumblr.com',
      'blogger.com', 'squarespace.com', 'weebly.com'
    ];
    // These contain a path, so they must be tested against the URL, not the host.
    const blogUrlFragments = ['/blog/', 'wix.com/blog'];
    const looksLikeBlog =
      blogHostKeywords.some(keyword => hostname.includes(keyword)) ||
      blogDomains.some(isOnDomain) ||
      blogUrlFragments.some(fragment => url.includes(fragment));
    if (looksLikeBlog) {
      console.log('β Page Type: blog');
      return 'blog';
    }

    // Check DOM for blog indicators
    if (document.querySelector('.blog, .post, article.entry, .blog-post')) {
      console.log('β Page Type: blog (DOM)');
      return 'blog';
    }

    // Check for article tag
    if (document.querySelector('article')) {
      console.log('β Page Type: article-page');
      return 'article-page';
    }

    console.log('β Page Type: unknown');
    return 'unknown';
  },

  /**
   * Extract the page title.
   *
   * Tries the Open Graph title, the Twitter title and an article heading in
   * that order, skipping any source that is missing or empty, and finally
   * falls back to `document.title`.
   *
   * @returns {string} The best available title.
   */
  extractTitle() {
    const metaSelectors = [
      'meta[property="og:title"]',
      'meta[name="twitter:title"]'
    ];
    for (const selector of metaSelectors) {
      const meta = document.querySelector(selector);
      const value = meta ? (meta.getAttribute('content') || '').trim() : '';
      if (value) return value;
    }

    const h1 = document.querySelector('article h1, .article h1, h1.title, h1.headline');
    if (h1) {
      const heading = h1.textContent.trim();
      if (heading) return heading;
    }

    return document.title;
  },

  /**
   * Extract page-level metadata.
   *
   * @returns {{author: string, publishDate: string}} Empty strings when a
   *   value cannot be found.
   */
  extractMetadata() {
    const metadata = {
      author: '',
      publishDate: ''
    };

    const authorSelectors = [
      'meta[name="author"]',
      'meta[property="article:author"]',
      '[rel="author"]',
      '.author',
      '.byline',
      '.author-name',
      '[itemprop="author"]'
    ];
    for (const selector of authorSelectors) {
      const element = document.querySelector(selector);
      if (!element) continue;
      // Strip a leading "By " so the result matches extractArticleAuthor().
      const author = (element.getAttribute('content') || element.textContent || '')
        .trim()
        .replace(/^by\s+/i, '');
      if (author) {
        metadata.author = author;
        break;
      }
    }

    const dateSelectors = [
      'meta[property="article:published_time"]',
      'meta[name="publish-date"]',
      'time[datetime]',
      '.publish-date',
      '.date',
      '[itemprop="datePublished"]'
    ];
    for (const selector of dateSelectors) {
      const element = document.querySelector(selector);
      if (!element) continue;
      const publishDate = (
        element.getAttribute('content') ||
        element.getAttribute('datetime') ||
        element.textContent ||
        ''
      ).trim();
      if (publishDate) {
        metadata.publishDate = publishDate;
        break;
      }
    }

    return metadata;
  },

  /**
   * Locate the main content element and return its cleaned text.
   *
   * For every selector the first matching element is measured; the element
   * with the most text (and more than 100 characters) wins. Falls back to
   * `document.body` when nothing qualifies.
   *
   * @returns {{text: string, adsBlocked: number}} Result of cleanText().
   */
  extractText() {
    const contentSelectors = [
      // BBC SPECIFIC (HIGH PRIORITY)
      'article[data-component="text-block"]', // BBC article blocks
      '[data-component="article-body"]', // BBC article body
      '.article__body-content', // BBC body content
      '.ssrcss-pv1rh6-ArticleWrapper', // BBC modern wrapper
      'main article', // BBC main article
      // Schema.org markup
      '[itemprop="articleBody"]',
      'div[itemprop="articleBody"]',
      // Article tags
      'article',
      '[role="article"]',
      // NDTV specific
      '.ins_storybody', // Main story body
      '.sp-cn', // Story container
      '#h_iframe', // Story iframe
      '.fullstorydiv', // Full story
      // Other Indian news sites
      '.article_block', // India Today
      '.highlights', // Times of India
      '.story_content', // Hindustan Times
      '.article-text', // The Hindu
      // Common news classes
      '.article-content',
      '.article-body',
      '.article__body',
      '.post-content',
      '.entry-content',
      '.story-body',
      '.story-content',
      '.content-body',
      // International news
      '.story-body__inner', // BBC old
      '.article__content', // Guardian
      '.paywall', // NYT
      // Generic
      '.content',
      'main',
      '#main-content'
    ];

    let mainElement = null;
    let maxLength = 0;
    for (const selector of contentSelectors) {
      let element = null;
      try {
        element = document.querySelector(selector);
      } catch (e) {
        // Selector not supported by this engine: skip it.
        continue;
      }
      if (!element) continue;

      const textLength = element.textContent.trim().length;
      if (textLength > maxLength && textLength > 100) {
        mainElement = element;
        maxLength = textLength;
        console.log(`β Found content with "${selector}": ${textLength} chars`);
      }
    }

    // mainElement is only ever set for elements with more than 100 characters.
    if (!mainElement) {
      console.log('β οΈ Using body as fallback');
      mainElement = document.body;
    }

    return this.cleanText(mainElement);
  },

  /**
   * Clean an element's text with balanced (not too aggressive) filtering.
   *
   * Works on a deep clone, so the live DOM is never modified.
   *
   * @param {Element} element - Element whose text should be extracted.
   * @returns {{text: string, adsBlocked: number}} `text` is whitespace-
   *   normalised; `adsBlocked` counts every element removed by the filter
   *   list below (scripts, navigation, comments and popups as well as ads).
   */
  cleanText(element) {
    const clone = element.cloneNode(true);
    let adsBlocked = 0;

    // Remove ONLY obvious unwanted elements (deliberately short list).
    const unwantedSelectors = [
      // Scripts and styles - ALWAYS remove
      'script', 'style', 'noscript', 'iframe', 'embed', 'object',
      // Navigation - be selective
      'nav', 'header', 'footer', 'menu',
      // ADS - Only obvious ad classes
      '.advertisement', '.ad-container', '.ad-banner', '.ad-slot',
      '[id*="google_ads"]', 'ins.adsbygoogle',
      '.taboola', '.outbrain', '.mgid',
      // Comments
      '.comments', '.comment-section', '.disqus',
      // Popups
      '.popup', '.modal', '.overlay'
    ];
    unwantedSelectors.forEach(selector => {
      try {
        const elements = clone.querySelectorAll(selector);
        adsBlocked += elements.length;
        elements.forEach(el => el.remove());
      } catch (e) {
        // Selector not supported by this engine: skip it.
      }
    });

    // Prefer text from content-bearing elements (preserves article content).
    const contentSelector = 'p, h1, h2, h3, h4, h5, h6, li, td, div[class*="content"], div[class*="article"], div[class*="story"]';
    const pieces = [];
    clone.querySelectorAll(contentSelector).forEach(el => {
      // An element nested inside another match is already covered by that
      // ancestor's textContent; including it again would duplicate its text.
      const parent = el.parentElement;
      const outerMatch = parent ? parent.closest(contentSelector) : null;
      if (outerMatch && outerMatch !== clone) return;

      const elText = el.textContent.trim();
      if (elText.length > 20) { // Only include substantial text
        pieces.push(elText);
      }
    });
    let text = pieces.join(' ');

    // Fallback: if no content found, use all text
    if (text.length < 100) {
      text = clone.textContent;
    }

    // Strip UI labels first, then collapse whitespace so no gaps are left.
    // NOTE: the label patterns are plain substring matches and will also hit
    // these words inside regular sentences.
    text = text
      .replace(/Advertisement/gi, '')
      .replace(/Sponsored/gi, '')
      .replace(/ShareSave/gi, '') // Remove social buttons
      .replace(/\s+/g, ' ')
      .trim();

    console.log(`π‘οΈ Blocked ${adsBlocked} ad elements`);
    console.log(`π Extracted text length: ${text.length} characters`);

    if (text.length === 0) {
      console.warn('β οΈ Warning: cleanText() returned 0 characters after filtering!');
      console.warn('β οΈ Trying absolute fallback...');
      // ABSOLUTE FALLBACK: all text of the original element, minimal cleaning.
      text = element.textContent
        .replace(/\s+/g, ' ')
        .trim();
      console.log(`π Fallback extracted: ${text.length} characters`);
    }

    return { text, adsBlocked };
  },

  /**
   * Remove descendants whose class, id or data-* attributes look ad-related.
   *
   * Short keywords ("ad", "ads") must match a whole token so that words such
   * as "header" or "download" are not treated as ads; longer keywords match
   * as substrings. Mutates `element` in place.
   *
   * @param {Element} element - Root whose descendants are inspected.
   * @returns {number} Number of elements removed (descendants of an already
   *   removed element are not counted separately).
   */
  removeAdElementsByAttributes(element) {
    const exactTokens = ['ad', 'ads'];
    const substringKeywords = [
      'advert', 'sponsor', 'promo', 'banner',
      'commercial', 'marketing', 'affiliate'
    ];
    let removed = 0;

    element.querySelectorAll('*').forEach(el => {
      // Already detached together with a removed ancestor.
      if (!element.contains(el)) return;

      // getAttribute works for SVG elements too, where className is not a string.
      const className = el.getAttribute('class') || '';
      const id = String(el.id || '');
      const dataAttrs = Array.from(el.attributes)
        .filter(attr => attr.name.startsWith('data-'))
        .map(attr => `${attr.name}=${attr.value}`)
        .join(' ');

      // Split camelCase ("adSlot" -> "ad Slot") before lower-casing so the
      // short keywords can still be found as separate tokens.
      const combined = `${className} ${id} ${dataAttrs}`
        .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
        .toLowerCase();
      const tokens = combined.split(/[^a-z0-9]+/);

      const looksLikeAd =
        tokens.some(token => exactTokens.includes(token)) ||
        substringKeywords.some(keyword => combined.includes(keyword));
      if (looksLikeAd) {
        el.remove();
        removed++;
      }
    });

    return removed;
  },

  /**
   * Remove short, link-heavy containers (more than 80% link text and under
   * 500 characters). Mutates `element` in place.
   *
   * @param {Element} element - Root whose div/section descendants are checked.
   * @returns {number} Number of containers removed.
   */
  removeSuspiciousElements(element) {
    let removed = 0;

    element.querySelectorAll('div, section').forEach(container => {
      // Already detached together with a removed ancestor.
      if (!element.contains(container)) return;

      const text = container.textContent.trim();
      if (text.length === 0) return;

      const linkText = Array.from(container.querySelectorAll('a'))
        .reduce((sum, link) => sum + link.textContent.length, 0);
      const linkDensity = linkText / text.length;
      if (linkDensity > 0.8 && text.length < 500) {
        container.remove();
        removed++;
      }
    });

    return removed;
  },

  /**
   * Extract up to 10 individual articles from the page.
   *
   * Elements with fewer than 200 characters of cleaned text are ignored, as
   * are elements that wrap or are nested inside an already accepted article.
   *
   * @returns {Array<{index: number, text: string, title: string, author: string,
   *   publishDate: string, url: string, wordCount: number, selector: string}>}
   */
  extractMultipleArticles() {
    const MAX_ARTICLES = 10;
    const MIN_ARTICLE_LENGTH = 200;
    const articleSelectors = [
      'article', '[itemtype*="Article"]', '.article',
      '.post', '.story', '.news-item', '[role="article"]'
    ];

    const articles = [];
    const acceptedElements = [];
    const seen = new Set();

    for (const selector of articleSelectors) {
      if (articles.length >= MAX_ARTICLES) break;

      for (const element of Array.from(document.querySelectorAll(selector))) {
        if (articles.length >= MAX_ARTICLES) break;
        if (seen.has(element)) continue;
        seen.add(element);

        // A wrapper or a nested fragment of an accepted article would only
        // repeat text that has already been collected.
        const overlapsAccepted = acceptedElements.some(other =>
          other.contains(element) || element.contains(other));
        if (overlapsAccepted) continue;

        const articleText = this.cleanText(element).text;
        if (articleText.length < MIN_ARTICLE_LENGTH) continue;

        articles.push({
          index: articles.length,
          text: articleText,
          title: this.extractArticleTitle(element),
          author: this.extractArticleAuthor(element),
          publishDate: this.extractArticleDate(element),
          url: this.extractArticleURL(element),
          wordCount: articleText.split(/\s+/).length,
          selector: selector
        });
        acceptedElements.push(element);
      }
    }

    console.log(`π Found ${articles.length} articles`);
    return articles;
  },

  /**
   * Find a heading-like title inside an article element.
   *
   * @param {Element} element - Article root.
   * @returns {string} First candidate longer than 10 characters, or ''.
   */
  extractArticleTitle(element) {
    const selectors = ['h1', 'h2', 'h3', '.title', '.headline', '[itemprop="headline"]'];
    for (const sel of selectors) {
      const el = element.querySelector(sel);
      const title = el ? el.textContent.trim() : '';
      if (title.length > 10) return title;
    }
    return '';
  },

  /**
   * Find the author inside an article element.
   *
   * @param {Element} element - Article root.
   * @returns {string} Author with any leading "By " removed, or ''.
   */
  extractArticleAuthor(element) {
    const selectors = ['[rel="author"]', '.author', '.byline', '[itemprop="author"]'];
    for (const sel of selectors) {
      const el = element.querySelector(sel);
      const author = el ? el.textContent.trim().replace(/^by\s+/i, '') : '';
      // Keep looking when the matched element is empty.
      if (author) return author;
    }
    return '';
  },

  /**
   * Find the publish date inside an article element.
   *
   * @param {Element} element - Article root.
   * @returns {string} The `datetime` attribute or text of the first non-empty
   *   match, or ''.
   */
  extractArticleDate(element) {
    const selectors = ['time[datetime]', '.publish-date', '[itemprop="datePublished"]'];
    for (const sel of selectors) {
      const el = element.querySelector(sel);
      const date = el ? (el.getAttribute('datetime') || el.textContent.trim()) : '';
      // Keep looking when the matched element is empty.
      if (date) return date;
    }
    return '';
  },

  /**
   * Determine the URL an article element links to.
   *
   * Uses the first link that resolves to an http(s) URL. Relative links are
   * resolved against the document base URL. Fragment-only, malformed and
   * non-http links (javascript:, mailto:, ...) are skipped.
   *
   * @param {Element} element - Article root.
   * @returns {string} Absolute URL, or the current page URL if none is found.
   */
  extractArticleURL(element) {
    const base = document.baseURI || window.location.href;

    for (const link of Array.from(element.querySelectorAll('a[href]'))) {
      const href = (link.getAttribute('href') || '').trim();
      if (!href || href.startsWith('#')) continue;

      // Already absolute: return unchanged.
      if (/^https?:\/\//i.test(href)) return href;

      try {
        const resolved = new URL(href, base);
        if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
          return resolved.href;
        }
      } catch (e) {
        // Malformed href: try the next link.
      }
    }

    return window.location.href;
  },

  /**
   * Score how trustworthy an extraction result is.
   *
   * Points are awarded for a title longer than 10 characters (15), author
   * (10), publish date (10), text length above 300/800/1500 characters
   * (15/10/10), a known page type (15), at least one article (10) and at
   * least one removed element (10). The sum is capped at 100.
   *
   * @param {{title: string, author: string, publishDate: string, text: string,
   *   pageType: string, articles: Array, adBlockedCount: number}} content
   * @returns {number} Confidence between 0 and 100.
   */
  calculateExtractionConfidence(content) {
    let confidence = 0;
    if (content.title && content.title.length > 10) confidence += 15;
    if (content.author) confidence += 10;
    if (content.publishDate) confidence += 10;
    if (content.text.length > 300) confidence += 15;
    if (content.text.length > 800) confidence += 10;
    if (content.text.length > 1500) confidence += 10;
    if (content.pageType !== 'unknown') confidence += 15;
    if (content.articles.length > 0) confidence += 10;
    if (content.adBlockedCount > 0) confidence += 10;
    return Math.min(confidence, 100);
  }
};

// Expose the extractor globally when running in a browser context.
if (typeof window !== 'undefined') {
  window.ContentExtractor = ContentExtractor;
}