Spaces:

zpsajst
/

linkscout-backend

Sleeping

App Files Files Community

linkscout-backend / web_interface /LinkScout /app /api /scrape-url /route.ts

zpsajst

Initial commit with environment variables for API keys

2398be6 about 1 month ago

raw

history blame

4.18 kB

	import { NextRequest, NextResponse } from 'next/server';
	import * as cheerio from 'cheerio';

	/**
	 * One unit of text extracted from a scraped page.
	 *
	 * `type` is limited to the three values this route emits, so a typo in a
	 * producer or consumer is caught at compile time instead of shipping as an
	 * arbitrary string.
	 */
	interface Paragraph {
	  /** Zero-based position of this entry in the returned list. */
	  index: number;
	  /** Trimmed text content of the source element or text chunk. */
	  text: string;
	  /** Origin of the text: a heading (h1-h6), a list item, or plain paragraph text. */
	  type: 'heading' | 'list' | 'p';
	}

	/** Containers tried, in priority order, when looking for the main article markup. */
	const CONTENT_CONTAINER_SELECTORS = [
	  'article',
	  '[role="main"]',
	  'main',
	  '.article-content',
	  '.post-content',
	  '.entry-content',
	  '.content',
	  'body',
	] as const;

	/** Elements whose text is collected as individual entries. */
	const TEXT_ELEMENT_SELECTOR = 'p, h1, h2, h3, h4, h5, h6, li';

	/** Entries shorter than this are treated as navigation/footer noise and dropped. */
	const MIN_TEXT_LENGTH = 20;

	/** The plain-text fallback only runs when the page has more than this many characters. */
	const MIN_FALLBACK_BODY_LENGTH = 50;

	/** Upper bound on the outbound request so a stalled host cannot hold the handler open. */
	const FETCH_TIMEOUT_MS = 15000;

	/** Parses `raw` as an absolute http(s) URL; returns null for anything else. */
	function parseHttpUrl(raw: unknown): URL | null {
	  if (typeof raw !== 'string') {
	    return null;
	  }
	  try {
	    const parsed = new URL(raw.trim());
	    return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : null;
	  } catch {
	    return null;
	  }
	}

	/** Returns the inner HTML of the first matching content container, else the body HTML. */
	function selectContentHtml($: ReturnType<typeof cheerio.load>): string {
	  const bodyHtml = $('body').html() || '';
	  for (const selector of CONTENT_CONTAINER_SELECTORS) {
	    const $match = $(selector);
	    if ($match.length > 0) {
	      // An empty container falls back to the whole body rather than yielding nothing.
	      return $match.html() || bodyHtml;
	    }
	  }
	  return bodyHtml;
	}

	/** Extracts paragraph/heading/list entries from `contentHtml`, falling back to raw text chunks. */
	function extractParagraphs(contentHtml: string): Paragraph[] {
	  const $content = cheerio.load(contentHtml);
	  const paragraphs: Paragraph[] = [];

	  const $nodes = $content(TEXT_ELEMENT_SELECTOR);
	  for (let i = 0; i < $nodes.length; i++) {
	    const $el = $nodes.eq(i);
	    const rawText = $el.text();
	    const text = rawText.trim();

	    // Skip very short text (likely navigation or footer items).
	    if (text.length < MIN_TEXT_LENGTH) {
	      continue;
	    }

	    // Skip elements whose entire text is a single link.
	    const $links = $el.find('a');
	    if ($links.length === 1 && rawText === $links.text()) {
	      continue;
	    }

	    paragraphs.push({
	      index: paragraphs.length,
	      text,
	      type: $el.is('h1, h2, h3, h4, h5, h6') ? 'heading' : $el.is('li') ? 'list' : 'p',
	    });
	  }

	  if (paragraphs.length > 0) {
	    return paragraphs;
	  }

	  // No structured elements survived the filters: fall back to the page's plain text.
	  const bodyText = $content.text().trim();
	  if (bodyText.length <= MIN_FALLBACK_BODY_LENGTH) {
	    return paragraphs;
	  }

	  // Split on blank lines, or on whitespace that follows a period.
	  return bodyText
	    .split(/\n\n+|(?<=\.)\s+/)
	    .map((chunk) => chunk.trim())
	    .filter((chunk) => chunk.length >= MIN_TEXT_LENGTH)
	    .map((chunk, i): Paragraph => ({ index: i, text: chunk, type: 'p' }));
	}

	/**
	 * POST /api/scrape-url
	 *
	 * Reads `{ url }` from the JSON request body, downloads that page, and
	 * returns its title plus the text entries found in the main content area.
	 *
	 * Responses:
	 * - 200 `{ success: true, url, title, paragraphs, total_paragraphs }`
	 * - 400 `{ success: false, error }` when the body is not valid JSON, `url` is
	 *   missing, or `url` is not an absolute http(s) URL
	 * - 500 `{ success: false, error, message }` when the download or parsing fails
	 *
	 * NOTE(review): this endpoint fetches caller-supplied URLs from the server.
	 * Only the scheme is validated here; consider also rejecting loopback and
	 * private-network hosts if the deployment can reach internal services.
	 */
	export async function POST(request: NextRequest) {
	  try {
	    let body: unknown;
	    try {
	      body = await request.json();
	    } catch {
	      // A malformed body is a client error, not a server failure.
	      return NextResponse.json(
	        { success: false, error: 'Request body must be valid JSON' },
	        { status: 400 }
	      );
	    }

	    const url =
	      typeof body === 'object' && body !== null ? (body as { url?: unknown }).url : undefined;

	    if (!url) {
	      return NextResponse.json(
	        { success: false, error: 'URL is required' },
	        { status: 400 }
	      );
	    }

	    const target = parseHttpUrl(url);
	    if (target === null) {
	      return NextResponse.json(
	        { success: false, error: 'URL must be a valid http(s) URL' },
	        { status: 400 }
	      );
	    }

	    console.log('🌐 Scraping URL:', url);

	    // Fetch the webpage with browser-like headers; some sites reject default user agents.
	    const response = await fetch(target.href, {
	      headers: {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	        'Accept-Language': 'en-US,en;q=0.5',
	      },
	      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
	    });

	    if (!response.ok) {
	      throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`);
	    }

	    const html = await response.text();
	    const $ = cheerio.load(html);

	    // Read the title before stripping elements: the first <h1> may live inside <header>.
	    const title =
	      $('title').text().trim() || $('h1').first().text().trim() || 'Untitled Article';

	    // Remove elements that never carry article text.
	    $('script, style, nav, header, footer, aside, iframe, noscript').remove();

	    const paragraphs = extractParagraphs(selectContentHtml($));

	    console.log(`✅ Scraped ${paragraphs.length} paragraphs from URL`);

	    return NextResponse.json({
	      success: true,
	      url: url,
	      title: title,
	      paragraphs: paragraphs,
	      total_paragraphs: paragraphs.length,
	    });
	  } catch (error: unknown) {
	    console.error('❌ URL Scraping Error:', error);

	    return NextResponse.json(
	      {
	        success: false,
	        error: error instanceof Error ? error.message : 'Failed to scrape URL',
	        message: 'Unable to fetch content from this URL. Please check if the URL is accessible.',
	      },
	      { status: 500 }
	    );
	  }
	}

	/**
	 * OPTIONS /api/scrape-url
	 *
	 * Answers CORS preflight requests: an empty 200 response that allows any
	 * origin to send POST/OPTIONS requests carrying a Content-Type header.
	 * The incoming request is not inspected.
	 */
	export async function OPTIONS(request: NextRequest) {
	  const corsHeaders = {
	    'Access-Control-Allow-Origin': '*',
	    'Access-Control-Allow-Methods': 'POST, OPTIONS',
	    'Access-Control-Allow-Headers': 'Content-Type',
	  };

	  return new NextResponse(null, { status: 200, headers: corsHeaders });
	}