const express = require('express');
const https = require('https');
const http = require('http');
const { URL } = require('url');

const router = express.Router();

// Output limits applied to the extracted metadata.
const MAX_TITLE_LENGTH = 100;
const MAX_DESCRIPTION_LENGTH = 150;

// Network limits for a single metadata lookup.
const GLOBAL_TIMEOUT_MS = 3000; // upper bound for the whole lookup, redirects included
const REQUEST_TIMEOUT_MS = 2000; // socket inactivity timeout per request
const MAX_BODY_BYTES = 20000; // most meta tags live in <head>, so 20KB is enough
const REDIRECT_STATUS_CODES = [301, 302, 303, 307, 308];
const USER_AGENT =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

const META_TAG_REGEX = /<meta\b[^>]*>/gi;
const META_CONTENT_REGEX = /\scontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i;
const TITLE_TAG_REGEX = /<title[^>]*>([^<]+)<\/title>/i;
const HTML_ENTITY_REGEX = /&(?:lt|gt|amp|quot|#39);/g;
const HTML_ENTITIES = {
    '&lt;': '<',
    '&gt;': '>',
    '&amp;': '&',
    '&quot;': '"',
    '&#39;': "'",
};

const URL_WITH_PROTOCOL_REGEX = /(https?:\/\/[^\s]+)/gi;
const URL_WITHOUT_PROTOCOL_REGEX =
    /(?:^|\s)((?:www\.)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?::[0-9]{1,5})?(?:\/[^\s]*)?)/gi;

/**
 * Decodes the handful of HTML entities that commonly appear in meta content.
 * A single pass is used so that "&amp;quot;" becomes "&quot;" and is not
 * decoded a second time.
 *
 * @param {string} text - Raw attribute or element text.
 * @returns {string} Text with the known entities replaced.
 */
function decodeHtmlEntities(text) {
    return text.replace(HTML_ENTITY_REGEX, (entity) => HTML_ENTITIES[entity]);
}

/**
 * Shortens text to maxLength characters, appending "..." when it was cut.
 *
 * @param {string} text
 * @param {number} maxLength
 * @returns {string}
 */
function truncate(text, maxLength) {
    return text.length > maxLength
        ? `${text.substring(0, maxLength)}...`
        : text;
}

/**
 * Returns the `content` value of the first <meta> tag whose attrName attribute
 * equals attrValue. Attribute order inside the tag does not matter, and the
 * content may be quoted with either quote character.
 *
 * @param {string[]} metaTags - Raw "<meta ...>" tag strings.
 * @param {string} attrName - "property" or "name".
 * @param {string} attrValue - Literal key such as "og:title"; it is inserted
 *     into a RegExp unescaped, so callers must only pass fixed keys.
 * @returns {string|null} The non-empty content value, or null.
 */
function findMetaContent(metaTags, attrName, attrValue) {
    const keyRegex = new RegExp(
        `\\s${attrName}\\s*=\\s*["']${attrValue}["']`,
        'i'
    );
    for (const tag of metaTags) {
        if (!keyRegex.test(tag)) {
            continue;
        }
        const contentMatch = tag.match(META_CONTENT_REGEX);
        if (contentMatch) {
            const value =
                contentMatch[1] !== undefined
                    ? contentMatch[1]
                    : contentMatch[2];
            if (value) {
                return value;
            }
        }
    }
    return null;
}

/**
 * Fast regex-based metadata extraction (avoids building a DOM for what is
 * usually just the document head).
 *
 * Priority order:
 *   title:       og:title > twitter:title > <title>
 *   image:       og:image > twitter:image
 *   description: og:description > twitter:description > meta description
 *
 * @param {string} html - Full or partial HTML document.
 * @returns {{title: (string|null), image: (string|null), description: (string|null)}}
 *     All fields are null when nothing was found or parsing failed.
 */
function extractMetadataFromHtml(html) {
    try {
        const metaTags = html.match(META_TAG_REGEX) || [];

        let title =
            findMetaContent(metaTags, 'property', 'og:title') ||
            findMetaContent(metaTags, 'name', 'twitter:title');
        if (!title) {
            const titleMatch = html.match(TITLE_TAG_REGEX);
            title = titleMatch ? titleMatch[1] : null;
        }
        if (title) {
            const cleaned = decodeHtmlEntities(title.trim());
            title = cleaned ? truncate(cleaned, MAX_TITLE_LENGTH) : null;
        }

        let image =
            findMetaContent(metaTags, 'property', 'og:image') ||
            findMetaContent(metaTags, 'name', 'twitter:image');
        if (image) {
            // Image URLs in attributes routinely carry "&amp;" in their query string.
            image = decodeHtmlEntities(image);
        }

        let description =
            findMetaContent(metaTags, 'property', 'og:description') ||
            findMetaContent(metaTags, 'name', 'twitter:description') ||
            findMetaContent(metaTags, 'name', 'description');
        if (description) {
            description = truncate(
                decodeHtmlEntities(description),
                MAX_DESCRIPTION_LENGTH
            );
        }

        return { title, image, description };
    } catch (error) {
        console.error('Error parsing HTML:', error);
        return { title: null, image: null, description: null };
    }
}

/**
 * Checks whether the given text, once trimmed, looks like a URL
 * (protocol optional).
 *
 * @param {string} text
 * @returns {boolean}
 */
function isUrl(text) {
    const urlRegex =
        /^(https?:\/\/)?[a-z0-9]+([-.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$/i;
    return urlRegex.test(text.trim());
}

/**
 * Resolves a possibly-relative URL against a base URL.
 *
 * @param {string} baseUrl
 * @param {string} relativeUrl
 * @returns {string} The absolute URL, or relativeUrl unchanged if it cannot
 *     be resolved.
 */
function resolveUrl(baseUrl, relativeUrl) {
    try {
        return new URL(relativeUrl, baseUrl).href;
    } catch {
        return relativeUrl;
    }
}

/**
 * Builds static metadata for YouTube watch / youtu.be links without making a
 * network request.
 *
 * @param {string} url
 * @returns {{title: string, image: string, description: string}|null}
 *     null when no 11-character video id is present in the URL.
 */
function handleYouTubeUrl(url) {
    const youtubeRegex =
        /(?:youtube\.com\/watch\?v=|youtu\.be\/)([a-zA-Z0-9_-]{11})/;
    const match = url.match(youtubeRegex);
    if (!match) {
        return null;
    }
    const videoId = match[1];
    // For now, return basic YouTube info - this is fast and reliable
    return {
        title: 'YouTube Video',
        image: `https://img.youtube.com/vi/${videoId}/maxresdefault.jpg`,
        description: 'YouTube video',
    };
}

/**
 * Fetches a page and extracts its title / image / description, following up
 * to maxRedirects redirects. Never rejects: every failure resolves to null.
 *
 * NOTE(security): the URL comes from the client and is fetched server-side,
 * so this can reach internal hosts (SSRF). Consider rejecting private and
 * loopback addresses before this is exposed to untrusted users.
 *
 * @param {string} url - Target URL; "http://" is prepended when no protocol
 *     is given.
 * @param {number} [maxRedirects=5] - Maximum number of redirects to follow.
 * @returns {Promise<{title: (string|null), image: (string|null), description: (string|null)}|null>}
 */
function fetchUrlMetadata(url, maxRedirects = 5) {
    return new Promise((resolve) => {
        // Add protocol if missing
        const startUrl = /^https?:\/\//i.test(url) ? url : `http://${url}`;

        // Handle YouTube URLs specially to avoid anti-bot issues
        if (startUrl.includes('youtube.com') || startUrl.includes('youtu.be')) {
            resolve(handleYouTubeUrl(startUrl));
            return;
        }

        let settled = false;
        let activeReq = null;
        let globalTimeout = null;

        // Single exit point: resolves once, stops the timer and tears down
        // whatever request is still in flight.
        const settle = (value) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(globalTimeout);
            if (activeReq) {
                activeReq.destroy();
            }
            resolve(value);
        };

        // Global timeout for the entire operation
        globalTimeout = setTimeout(() => settle(null), GLOBAL_TIMEOUT_MS);

        function makeRequest(currentUrl, redirectCount) {
            if (redirectCount > maxRedirects) {
                settle(null);
                return;
            }

            try {
                const urlObj = new URL(currentUrl);
                const isHttps = urlObj.protocol === 'https:';
                if (!isHttps && urlObj.protocol !== 'http:') {
                    // A redirect may point at a scheme we cannot fetch.
                    settle(null);
                    return;
                }
                const client = isHttps ? https : http;

                const options = {
                    hostname: urlObj.hostname,
                    port: urlObj.port || (isHttps ? 443 : 80),
                    path: urlObj.pathname + urlObj.search,
                    method: 'GET',
                    timeout: REQUEST_TIMEOUT_MS,
                    headers: { 'User-Agent': USER_AGENT },
                };

                const req = client.request(options, (res) => {
                    const { statusCode } = res;

                    // Handle redirects (301, 302, 303, 307, 308)
                    if (
                        REDIRECT_STATUS_CODES.includes(statusCode) &&
                        res.headers.location
                    ) {
                        // The redirect body is irrelevant; drop the connection.
                        req.destroy();
                        let redirectUrl;
                        try {
                            redirectUrl = new URL(
                                res.headers.location,
                                currentUrl
                            ).href;
                        } catch {
                            settle(null);
                            return;
                        }
                        makeRequest(redirectUrl, redirectCount + 1);
                        return;
                    }

                    // If not a successful response, resolve with null
                    if (statusCode < 200 || statusCode >= 400) {
                        settle(null);
                        return;
                    }

                    // Decode as text so multi-byte characters split across
                    // chunks are reassembled correctly.
                    res.setEncoding('utf8');

                    let html = '';
                    let receivedBytes = 0;
                    let foundMeta = false;

                    // Parses whatever has been collected and resolves with it.
                    const finish = () => {
                        if (settled) {
                            return;
                        }
                        const metadata = extractMetadataFromHtml(html);
                        // Resolve relative image URLs to absolute
                        if (
                            metadata.image &&
                            !metadata.image.startsWith('http')
                        ) {
                            metadata.image = resolveUrl(
                                currentUrl,
                                metadata.image
                            );
                        }
                        settle(metadata);
                    };

                    res.on('data', (chunk) => {
                        if (settled) {
                            return;
                        }
                        receivedBytes += Buffer.byteLength(chunk);
                        if (receivedBytes > MAX_BODY_BYTES) {
                            // Size cap reached: use what was read so far.
                            finish();
                            return;
                        }
                        html += chunk;

                        if (
                            !foundMeta &&
                            (html.includes('og:title') ||
                                html.includes('twitter:title') ||
                                html.includes('<title>'))
                        ) {
                            foundMeta = true;
                        }

                        // Stop early if we have meta tags and hit end of head
                        if (foundMeta && html.includes('</head>')) {
                            finish();
                        }
                    });

                    res.on('end', finish);
                });

                activeReq = req;

                req.on('error', () => {
                    // Ignore errors from a request that a redirect superseded.
                    if (req === activeReq) {
                        settle(null);
                    }
                });
                req.on('timeout', () => settle(null));

                req.end();
            } catch {
                settle(null);
            }
        }

        makeRequest(startUrl, 0);
    });
}

/**
 * Express middleware that rejects requests without a logged-in session.
 */
function requireSession(req, res, next) {
    if (!req.session || !req.session.userId) {
        return res.status(401).json({ error: 'Authentication required' });
    }
    return next();
}

// GET /api/url/title
router.get('/url/title', requireSession, async (req, res) => {
    try {
        const { url } = req.query;

        // Repeated or nested query params arrive as arrays/objects; only a
        // non-empty string is a usable URL.
        if (typeof url !== 'string' || url.trim() === '') {
            return res.status(400).json({ error: 'URL parameter is required' });
        }

        const metadata = await fetchUrlMetadata(url.trim());

        if (metadata && metadata.title) {
            return res.json({
                url,
                title: metadata.title,
                image: metadata.image,
                description: metadata.description,
            });
        }
        return res.json({
            url,
            title: null,
            image: null,
            description: null,
            error: 'Could not extract metadata',
        });
    } catch (error) {
        console.error('Error extracting URL title:', error);
        return res.status(500).json({ error: 'Internal server error' });
    }
});

// POST /api/url/extract-from-text
router.post('/url/extract-from-text', requireSession, async (req, res) => {
    try {
        const { text } = req.body || {};

        if (typeof text !== 'string' || !text) {
            return res
                .status(400)
                .json({ error: 'Text parameter is required' });
        }

        // Enhanced URL extraction - look for URLs with or without protocol
        let urls = text.match(URL_WITH_PROTOCOL_REGEX);

        // If no URLs with protocol found, look for URLs without protocol
        if (!urls) {
            const matches = text.match(URL_WITHOUT_PROTOCOL_REGEX);
            if (matches) {
                // Clean up the matches (remove leading whitespace)
                urls = matches.map((match) => match.trim());
            }
        }

        if (!urls || urls.length === 0) {
            return res.json({ found: false });
        }

        const firstUrl = urls[0];
        const metadata = await fetchUrlMetadata(firstUrl);
        const hasTitle = Boolean(metadata && metadata.title);

        return res.json({
            found: true,
            url: firstUrl,
            title: hasTitle ? metadata.title : null,
            image: hasTitle ? metadata.image : null,
            description: hasTitle ? metadata.description : null,
            originalText: text,
        });
    } catch (error) {
        console.error('Error extracting URL from text:', error);
        return res.status(500).json({ error: 'Internal server error' });
    }
});

module.exports = router;