Spaces:
Sleeping
Sleeping
| import { NextRequest, NextResponse } from 'next/server'; | |
| import * as cheerio from 'cheerio'; | |
| interface Paragraph { | |
| index: number; | |
| text: string; | |
| type: string; | |
| } | |
| export async function POST(request: NextRequest) { | |
| try { | |
| const { url } = await request.json(); | |
| if (!url) { | |
| return NextResponse.json( | |
| { success: false, error: 'URL is required' }, | |
| { status: 400 } | |
| ); | |
| } | |
| console.log('π Scraping URL:', url); | |
| // Fetch the webpage | |
| const response = await fetch(url, { | |
| headers: { | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', | |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', | |
| 'Accept-Language': 'en-US,en;q=0.5', | |
| }, | |
| }); | |
| if (!response.ok) { | |
| throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`); | |
| } | |
| const html = await response.text(); | |
| const $ = cheerio.load(html); | |
| // Extract title | |
| let title = $('title').text().trim(); | |
| if (!title) { | |
| title = $('h1').first().text().trim() || 'Untitled Article'; | |
| } | |
| // Remove unwanted elements | |
| $('script, style, nav, header, footer, aside, iframe, noscript').remove(); | |
| // Extract paragraphs from article content | |
| const paragraphs: Paragraph[] = []; | |
| let index = 0; | |
| // Try to find article content in common containers | |
| const articleSelectors = [ | |
| 'article', | |
| '[role="main"]', | |
| 'main', | |
| '.article-content', | |
| '.post-content', | |
| '.entry-content', | |
| '.content', | |
| 'body', | |
| ]; | |
| let contentHtml = $('body').html() || ''; | |
| for (const selector of articleSelectors) { | |
| const $selected = $(selector); | |
| if ($selected.length > 0) { | |
| contentHtml = $selected.html() || contentHtml; | |
| break; | |
| } | |
| } | |
| const $content = cheerio.load(contentHtml); | |
| // Extract text from paragraphs, headings, and list items | |
| $content('p, h1, h2, h3, h4, h5, h6, li').each((_idx: any, element: any) => { | |
| const $el = $content(element); | |
| const text = $el.text().trim(); | |
| // Skip very short text (likely navigation or footer items) | |
| if (text.length < 20) { | |
| return; | |
| } | |
| // Skip if it's just a link | |
| if ($el.find('a').length === 1 && $el.text() === $el.find('a').text()) { | |
| return; | |
| } | |
| const tagName = element.tagName?.toLowerCase() || 'p'; | |
| paragraphs.push({ | |
| index: index++, | |
| text: text, | |
| type: tagName.startsWith('h') ? 'heading' : tagName === 'li' ? 'list' : 'p', | |
| }); | |
| }); | |
| // If no paragraphs found, try to extract all text | |
| if (paragraphs.length === 0) { | |
| const bodyText = $content.text().trim(); | |
| if (bodyText.length > 50) { | |
| // Split by double newlines or periods followed by whitespace | |
| const chunks = bodyText | |
| .split(/\n\n+|(?<=\.)\s+/) | |
| .map((chunk: string) => chunk.trim()) | |
| .filter((chunk: string) => chunk.length >= 20); | |
| chunks.forEach((text: string, i: number) => { | |
| paragraphs.push({ | |
| index: i, | |
| text: text, | |
| type: 'p', | |
| }); | |
| }); | |
| } | |
| } | |
| console.log(`β Scraped ${paragraphs.length} paragraphs from URL`); | |
| return NextResponse.json({ | |
| success: true, | |
| url: url, | |
| title: title, | |
| paragraphs: paragraphs, | |
| total_paragraphs: paragraphs.length, | |
| }); | |
| } catch (error) { | |
| console.error('β URL Scraping Error:', error); | |
| return NextResponse.json( | |
| { | |
| success: false, | |
| error: error instanceof Error ? error.message : 'Failed to scrape URL', | |
| message: 'Unable to fetch content from this URL. Please check if the URL is accessible.', | |
| }, | |
| { status: 500 } | |
| ); | |
| } | |
| } | |
| export async function OPTIONS(request: NextRequest) { | |
| return new NextResponse(null, { | |
| status: 200, | |
| headers: { | |
| 'Access-Control-Allow-Origin': '*', | |
| 'Access-Control-Allow-Methods': 'POST, OPTIONS', | |
| 'Access-Control-Allow-Headers': 'Content-Type', | |
| }, | |
| }); | |
| } | |