// Spaces: Sleeping | File size: 4,183 Bytes | 2398be6
import { NextRequest, NextResponse } from 'next/server';
import * as cheerio from 'cheerio';
/**
 * One extracted block of readable text from the scraped page.
 */
interface Paragraph {
  /** Zero-based position of this block in the extracted sequence. */
  index: number;
  /** Trimmed text content of the block. */
  text: string;
  /**
   * Kind of source element: 'heading' for h1-h6, 'list' for li,
   * and 'p' for paragraphs and plain-text fallback chunks.
   */
  type: 'heading' | 'list' | 'p';
}
/**
 * Parses `raw` as an absolute http(s) URL.
 *
 * @param raw - Untrusted value taken from the request body.
 * @returns The parsed URL, or `null` when `raw` is not a string, is not a
 *   parseable absolute URL, or uses a scheme other than http/https.
 */
function parseHttpUrl(raw: unknown): URL | null {
  if (typeof raw !== 'string') {
    return null;
  }
  try {
    const parsed = new URL(raw);
    return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : null;
  } catch {
    // `new URL` throws on relative or malformed input.
    return null;
  }
}

/**
 * Scrapes a web page and returns its title plus the readable text blocks.
 *
 * Expects a JSON body of the form `{ "url": "<absolute http(s) URL>" }`.
 *
 * Responses:
 * - 200 `{ success: true, url, title, paragraphs, total_paragraphs }`
 * - 400 `{ success: false, error }` when the body is not valid JSON, `url` is
 *   missing, or `url` is not an absolute http(s) URL
 * - 500 `{ success: false, error, message }` when fetching or parsing fails
 *
 * @param request - Incoming request carrying the JSON body.
 */
export async function POST(request: NextRequest) {
  // Parse the body outside the scraping try/catch so that a malformed payload
  // is reported as a client error (400) instead of a server error (500).
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return NextResponse.json(
      { success: false, error: 'Request body must be valid JSON' },
      { status: 400 }
    );
  }

  const url: unknown =
    typeof body === 'object' && body !== null
      ? (body as Record<string, unknown>).url
      : undefined;

  if (!url) {
    return NextResponse.json(
      { success: false, error: 'URL is required' },
      { status: 400 }
    );
  }

  // The URL is attacker-controlled: only allow absolute http(s) targets.
  // NOTE(review): this does not stop requests to internal/loopback hosts
  // (SSRF). Add a host allow/deny list if this route is publicly reachable.
  const target = parseHttpUrl(url);
  if (!target) {
    return NextResponse.json(
      { success: false, error: 'URL must be an absolute http(s) URL' },
      { status: 400 }
    );
  }

  try {
    console.log('π Scraping URL:', url);

    // Fetch the webpage with browser-like headers.
    const response = await fetch(target.href, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
    });

    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Extract the title before stripping elements: <title> first, then the first <h1>.
    let title = $('title').text().trim();
    if (!title) {
      title = $('h1').first().text().trim() || 'Untitled Article';
    }

    // Remove elements that never contain article text.
    $('script, style, nav, header, footer, aside, iframe, noscript').remove();

    const paragraphs: Paragraph[] = [];
    let index = 0;

    // Common article containers, most specific first; 'body' is the catch-all.
    const articleSelectors = [
      'article',
      '[role="main"]',
      'main',
      '.article-content',
      '.post-content',
      '.entry-content',
      '.content',
      'body',
    ];

    // Use the first selector that matches anything. If its HTML is empty,
    // keep the <body> HTML as the fallback.
    let contentHtml = $('body').html() || '';
    for (const selector of articleSelectors) {
      const $selected = $(selector);
      if ($selected.length > 0) {
        contentHtml = $selected.html() || contentHtml;
        break;
      }
    }

    const $content = cheerio.load(contentHtml);

    // Collect text from paragraphs, headings, and list items in document order.
    // `element` stays loosely typed as in the original.
    // NOTE(review): tighten to cheerio's element type once the installed version is confirmed.
    $content('p, h1, h2, h3, h4, h5, h6, li').each((_idx: any, element: any) => {
      const $el = $content(element);
      const rawText = $el.text();
      const text = rawText.trim();

      // Skip very short text (likely navigation or footer items).
      if (text.length < 20) {
        return;
      }

      // Skip elements whose entire text is a single link.
      const $links = $el.find('a');
      if ($links.length === 1 && rawText === $links.text()) {
        return;
      }

      const tagName = element.tagName?.toLowerCase() || 'p';
      paragraphs.push({
        index: index++,
        text,
        type: tagName.startsWith('h') ? 'heading' : tagName === 'li' ? 'list' : 'p',
      });
    });

    // Fallback: no structured blocks found, so split the raw text instead.
    if (paragraphs.length === 0) {
      const bodyText = $content.text().trim();
      if (bodyText.length > 50) {
        // Split on blank lines or on whitespace that follows a period.
        const chunks = bodyText
          .split(/\n\n+|(?<=\.)\s+/)
          .map((chunk: string) => chunk.trim())
          .filter((chunk: string) => chunk.length >= 20);

        chunks.forEach((text: string, i: number) => {
          paragraphs.push({ index: i, text, type: 'p' });
        });
      }
    }

    console.log(`β\nScraped ${paragraphs.length} paragraphs from URL`);

    return NextResponse.json({
      success: true,
      url: url,
      title: title,
      paragraphs: paragraphs,
      total_paragraphs: paragraphs.length,
    });
  } catch (error) {
    console.error('β URL Scraping Error:', error);
    return NextResponse.json(
      {
        success: false,
        error: error instanceof Error ? error.message : 'Failed to scrape URL',
        message: 'Unable to fetch content from this URL. Please check if the URL is accessible.',
      },
      { status: 500 }
    );
  }
}
/**
 * Answers CORS preflight requests for this route.
 *
 * Responds with an empty 200 body that allows any origin to send POST/OPTIONS
 * requests carrying a Content-Type header.
 *
 * @param request - Incoming preflight request; not inspected.
 */
export async function OPTIONS(request: NextRequest) {
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'POST, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type',
  };
  const init = { status: 200, headers: corsHeaders };

  return new NextResponse(null, init);
}
|