tududi/backend/routes/url.js
Antonis Anastasiadis e594d1075b
Linting cleanup (#99)
* Add eslint and prettier dependencies and configs

* Lint project.
2025-07-01 11:40:09 +03:00

383 lines
13 KiB
JavaScript

const express = require('express');
const https = require('https');
const http = require('http');
const { URL } = require('url');
const router = express.Router();
// Fast regex-based metadata extraction (much faster than cheerio for head content)
function extractMetadataFromHtml(html) {
try {
// Extract title with priority: og:title > twitter:title > title tag
let title = null;
// Try og:title first
const ogTitleMatch = html.match(
/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']+)["']/i
);
if (ogTitleMatch) {
title = ogTitleMatch[1];
} else {
// Try twitter:title
const twitterTitleMatch = html.match(
/<meta[^>]+name=["']twitter:title["'][^>]+content=["']([^"']+)["']/i
);
if (twitterTitleMatch) {
title = twitterTitleMatch[1];
} else {
// Fallback to title tag
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
if (titleMatch) {
title = titleMatch[1].trim();
}
}
}
// Clean up title
if (title) {
title = title.trim();
// Decode common HTML entities
title = title
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
if (title.length > 100) {
title = title.substring(0, 100) + '...';
}
}
// Extract image with priority: og:image > twitter:image
let image = null;
const ogImageMatch = html.match(
/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i
);
if (ogImageMatch) {
image = ogImageMatch[1];
} else {
const twitterImageMatch = html.match(
/<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"']+)["']/i
);
if (twitterImageMatch) {
image = twitterImageMatch[1];
}
}
// Extract description
let description = null;
const ogDescMatch = html.match(
/<meta[^>]+property=["']og:description["'][^>]+content=["']([^"']+)["']/i
);
if (ogDescMatch) {
description = ogDescMatch[1];
} else {
const twitterDescMatch = html.match(
/<meta[^>]+name=["']twitter:description["'][^>]+content=["']([^"']+)["']/i
);
if (twitterDescMatch) {
description = twitterDescMatch[1];
} else {
const metaDescMatch = html.match(
/<meta[^>]+name=["']description["'][^>]+content=["']([^"']+)["']/i
);
if (metaDescMatch) {
description = metaDescMatch[1];
}
}
}
if (description && description.length > 150) {
description = description.substring(0, 150) + '...';
}
return {
title,
image,
description,
};
} catch (error) {
console.error('Error parsing HTML:', error);
return { title: null, image: null, description: null };
}
}
// Helper function to check if text is a URL
function isUrl(text) {
const urlRegex =
/^(https?:\/\/)?[a-z0-9]+([-.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$/i;
return urlRegex.test(text.trim());
}
// Helper function to resolve relative URLs to absolute URLs
function resolveUrl(baseUrl, relativeUrl) {
try {
return new URL(relativeUrl, baseUrl).href;
} catch {
return relativeUrl;
}
}
// Helper function to handle YouTube URLs specially
function handleYouTubeUrl(url) {
const youtubeRegex =
/(?:youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]{11})/;
const match = url.match(youtubeRegex);
if (match) {
const videoId = match[1];
// For now, return basic YouTube info - this is fast and reliable
return {
title: 'YouTube Video',
image: `https://img.youtube.com/vi/${videoId}/maxresdefault.jpg`,
description: 'YouTube video',
};
}
return null;
}
// Helper function to fetch URL metadata with redirect handling
async function fetchUrlMetadata(url, maxRedirects = 5) {
return new Promise((resolve) => {
// Add protocol if missing
if (!url.startsWith('http://') && !url.startsWith('https://')) {
url = 'http://' + url;
}
// Handle YouTube URLs specially to avoid anti-bot issues
if (url.includes('youtube.com') || url.includes('youtu.be')) {
const youtubeMetadata = handleYouTubeUrl(url);
if (youtubeMetadata) {
resolve(youtubeMetadata);
} else {
resolve(null);
}
return;
}
// Global timeout for the entire operation
const globalTimeout = setTimeout(() => {
resolve(null);
}, 3000); // 3 second max for entire operation
function makeRequest(currentUrl, redirectCount = 0) {
if (redirectCount > maxRedirects) {
clearTimeout(globalTimeout);
resolve(null);
return;
}
try {
const urlObj = new URL(currentUrl);
const isHttps = urlObj.protocol === 'https:';
const client = isHttps ? https : http;
const options = {
hostname: urlObj.hostname,
port: urlObj.port || (isHttps ? 443 : 80),
path: urlObj.pathname + urlObj.search,
method: 'GET',
timeout: 2000, // Reduced from 5000ms to 2000ms
headers: {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
},
};
const req = client.request(options, (res) => {
// Handle redirects (301, 302, 303, 307, 308)
if (
[301, 302, 303, 307, 308].includes(res.statusCode) &&
res.headers.location
) {
const redirectUrl = new URL(
res.headers.location,
currentUrl
).href;
makeRequest(redirectUrl, redirectCount + 1);
return;
}
// If not a successful response, resolve with null
if (res.statusCode < 200 || res.statusCode >= 400) {
clearTimeout(globalTimeout);
resolve(null);
return;
}
let data = '';
let totalBytes = 0;
const maxBytes = 20000; // Reduced from 100KB to 20KB - most meta tags are in head
let foundMeta = false;
res.on('data', (chunk) => {
totalBytes += chunk.length;
if (totalBytes > maxBytes) {
clearTimeout(globalTimeout);
req.destroy();
return;
}
data += chunk;
// Early termination if we've found essential meta tags and closed head
if (
!foundMeta &&
(data.includes('og:title') ||
data.includes('twitter:title') ||
data.includes('</title>'))
) {
foundMeta = true;
}
// Stop early if we have meta tags and hit end of head
if (foundMeta && data.includes('</head>')) {
clearTimeout(globalTimeout);
req.destroy();
return;
}
});
res.on('end', () => {
clearTimeout(globalTimeout);
const metadata = extractMetadataFromHtml(data);
// Resolve relative image URLs to absolute
if (
metadata.image &&
!metadata.image.startsWith('http')
) {
metadata.image = resolveUrl(
currentUrl,
metadata.image
);
}
resolve(metadata);
});
});
req.on('error', (err) => {
clearTimeout(globalTimeout);
resolve(null);
});
req.on('timeout', () => {
clearTimeout(globalTimeout);
req.destroy();
resolve(null);
});
req.end();
} catch (error) {
clearTimeout(globalTimeout);
resolve(null);
}
}
makeRequest(url);
});
}
// GET /api/url/title
router.get('/url/title', async (req, res) => {
try {
if (!req.session || !req.session.userId) {
return res.status(401).json({ error: 'Authentication required' });
}
const { url } = req.query;
if (!url) {
return res.status(400).json({ error: 'URL parameter is required' });
}
const metadata = await fetchUrlMetadata(url);
if (metadata && metadata.title) {
res.json({
url,
title: metadata.title,
image: metadata.image,
description: metadata.description,
});
} else {
res.json({
url,
title: null,
image: null,
description: null,
error: 'Could not extract metadata',
});
}
} catch (error) {
console.error('Error extracting URL title:', error);
res.status(500).json({ error: 'Internal server error' });
}
});
// POST /api/url/extract-from-text
router.post('/url/extract-from-text', async (req, res) => {
try {
if (!req.session || !req.session.userId) {
return res.status(401).json({ error: 'Authentication required' });
}
const { text } = req.body;
if (!text) {
return res
.status(400)
.json({ error: 'Text parameter is required' });
}
// Enhanced URL extraction - look for URLs with or without protocol
const urlWithProtocolRegex = /(https?:\/\/[^\s]+)/gi;
const urlWithoutProtocolRegex =
/(?:^|\s)((?:www\.)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?::[0-9]{1,5})?(?:\/[^\s]*)?)/gi;
let urls = text.match(urlWithProtocolRegex);
// If no URLs with protocol found, look for URLs without protocol
if (!urls) {
const matches = text.match(urlWithoutProtocolRegex);
if (matches) {
// Clean up the matches (remove leading whitespace)
urls = matches.map((match) => match.trim());
}
}
if (urls && urls.length > 0) {
const firstUrl = urls[0];
const metadata = await fetchUrlMetadata(firstUrl);
if (metadata && metadata.title) {
res.json({
found: true,
url: firstUrl,
title: metadata.title,
image: metadata.image,
description: metadata.description,
originalText: text,
});
} else {
res.json({
found: true,
url: firstUrl,
title: null,
image: null,
description: null,
originalText: text,
});
}
} else {
res.json({ found: false });
}
} catch (error) {
console.error('Error extracting URL from text:', error);
res.status(500).json({ error: 'Internal server error' });
}
});
module.exports = router;