import express from 'express'; import * as cheerio from 'cheerio'; import axios from 'axios'; import rateLimit from 'express-rate-limit'; const router = express.Router(); /* ----------------------------- Rate limiting ----------------------------- */ const rumbleLimiter = rateLimit({ windowMs: 60 * 1000, max: 20, standardHeaders: true, legacyHeaders: false, message: { error: 'Too many requests to Rumble. Please try again later.' } }); router.use(rumbleLimiter); /* --------------------------------- Cache -------------------------------- */ const cache = new Map(); const TTL_MS = 60 * 1000; // 60s function cacheKey(path, params) { return `${path}?${new URLSearchParams(params).toString()}`; } function setCache(key, data) { cache.set(key, { data, expires: Date.now() + TTL_MS }); } function getCache(key) { const hit = cache.get(key); if (!hit) return null; if (Date.now() > hit.expires) { cache.delete(key); return null; } return hit.data; } /* ------------------------------- HTTP GET -------------------------------- */ async function httpGet(url) { const resp = await axios.get(url, { headers: { // UA “desktop” moderne pour minimiser les anti-bot simples 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.8' }, timeout: 15000, // Important: pas de redirects inter-domain hasardeux maxRedirects: 3, validateStatus: s => s >= 200 && s < 400 }); return resp.data; } /* ------------------------- Utils: normalisation ID ------------------------ */ /** * Rumble expose plusieurs formes: * - Page canoniques: https://rumble.com/v6siqxf-some-title.html * - Ancienne forme: https://rumble.com/video/12345 * - URL d’embed officielle: https://rumble.com/embed/v6siqxf/ * - ID brut attendu: v6siqxf (toujours commence par 'v' + base62) * * Cette fonction accepte: ID ou URL et renvoie { id: 'vXXXX', 
urlCanonique, embedUrl } */ function normalizeRumbleId(input, { preferEmbed = true } = {}) { if (!input) return null; let id = null; let urlCanonique = null; let embedUrl = null; // 1) Si on nous donne déjà un ID "vXXXX" const clean = String(input).trim(); const mIdOnly = /^v[0-9A-Za-z]+$/.exec(clean); if (mIdOnly) { id = clean; urlCanonique = `https://rumble.com/${id}`; embedUrl = `https://rumble.com/embed/${id}/`; return { id, urlCanonique, embedUrl }; } // 2) Si on nous donne une URL try { const u = new URL(clean, 'https://rumble.com'); // /embed/vXXXX/ let m = /\/embed\/(v[0-9A-Za-z]+)/.exec(u.pathname); if (!m) m = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(u.pathname); if (!m) { // ancienne forme /video/123 → on ne sait pas convertir de manière fiable const mOld = /\/video\/([0-9A-Za-z]+)/.exec(u.pathname); if (mOld) { // On garde l’URL telle quelle et laissera le parseur de la page extraire le vrai vID. return { id: null, urlCanonique: u.href, embedUrl: null }; } return null; } id = m[1]; urlCanonique = `https://rumble.com/${id}`; embedUrl = `https://rumble.com/embed/${id}/`; return { id, urlCanonique, embedUrl }; } catch { return null; } } /* ---------------------- Parsing robuste d’une PAGE vidéo ------------------ */ /** * Source d’autorité pour le vrai ID: le JS inline: * Rumble("play", {..., "video":"vXXXX", ...}) * On prend ensuite en fallback: (souvent /embed/vXXXX/) * puis ou (contenant /vXXXX-...). * * NB: ce choix est basé sur l’observation publique: la valeur "video":"vXXXX" * est exactement l’ID attendu par l’embed officiel. */ function extractVideoIdentity($) { // 1) Script "Rumble('play', {... "video":"vXXXX" ...})" // On évite d’exécuter quoi que ce soit; simple regex sur tout le HTML. 
const html = $.html() || ''; let m = /Rumble\(\s*["']play["']\s*,\s*{[^}]*["']video["']\s*:\s*["'](v[0-9A-Za-z]+)["']/s.exec(html); if (m && m[1]) { const id = m[1]; return { id, embedUrl: `https://rumble.com/embed/${id}/`, urlCanonique: `https://rumble.com/${id}` }; } // 2) og:video → .../embed/vXXXX/... let embed = $('meta[property="og:video"]').attr('content') || $('meta[name="twitter:player"]').attr('content'); if (embed) { if (embed.startsWith('//')) embed = 'https:' + embed; const mm = /\/embed\/(v[0-9A-Za-z]+)/.exec(embed); if (mm) { const id = mm[1]; return { id, embedUrl: `https://rumble.com/embed/${id}/`, urlCanonique: `https://rumble.com/${id}` }; } } // 3) Canonical / og:url → .../vXXXX-... let canon = $('link[rel="canonical"]').attr('href') || $('meta[property="og:url"]').attr('content'); if (canon) { const mm = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(canon); if (mm) { const id = mm[1]; return { id, embedUrl: `https://rumble.com/embed/${id}/`, urlCanonique: `https://rumble.com/${id}` }; } } return null; } /* -------------------------- Scraper d’une vidéo -------------------------- */ async function scrapeRumbleVideo(videoIdOrUrl) { try { // Accepte /:videoId ou une URL complète. 
let norm = normalizeRumbleId(videoIdOrUrl); const fetchUrl = norm?.urlCanonique || `https://rumble.com/${videoIdOrUrl}`; const html = await httpGet(fetchUrl); const $ = cheerio.load(html); // Identité fiable (id + embed + canonique) let ident = extractVideoIdentity($); if (!ident) { // dernier recours: ré-essayer avec la page telle quelle si on est venu via /video/123 if (!norm?.id && norm?.urlCanonique) { ident = extractVideoIdentity($); } } if (!ident?.id) { return { error: 'Unable to determine Rumble video ID', input: videoIdOrUrl }; } // Métadonnées robustes const title = $('h1.video-title, .video-title h1').first().text().trim() || $('meta[property="og:title"]').attr('content') || 'Untitled Video'; let thumbnail = $('meta[property="og:image"]').attr('content') || ''; if (thumbnail && thumbnail.startsWith('//')) thumbnail = 'https:' + thumbnail; const uploaderName = $('.media-by--a, .channel-name, a[href*="/c/"]').first().text().trim() || ''; const viewsText = $('.rumbles-views, .video-views, .media-view-count, [data-view-count]').first().text().trim() || ''; const views = parseInt(viewsText.replace(/[^\d]/g, ''), 10) || 0; const duration = parseInt($('meta[property="video:duration"]').attr('content') || '', 10) || 0; const uploadedDate = $('meta[property="article:published_time"]').attr('content') || $('time[datetime]').attr('datetime') || ''; const description = $('meta[property="og:description"]').attr('content') || $('meta[name="description"]').attr('content') || ''; // embedUrl final — toujours la forme officielle const embedUrl = ident.embedUrl; return { videoId: ident.id, title, thumbnail, uploaderName, views, duration, uploadedDate, description, url: ident.urlCanonique, embedUrl, type: 'video' }; } catch (e) { const msg = (e && e.message) ? 
e.message : String(e); return { error: `Scraping failed: ${msg}` }; } } /* ------------------ Scraper de liste (search / browse) ------------------ */ function parseDurationToSeconds(text) { if (!text) return 0; // Nettoyer le texte en supprimant les espaces et caractères non numériques inutiles const cleanText = text.trim().replace(/\s+/g, ''); // Format hh:mm:ss let m = cleanText.match(/^(\d+):(\d{2}):(\d{2})$/); if (m) { const h = parseInt(m[1], 10) || 0; const mn = parseInt(m[2], 10) || 0; const s = parseInt(m[3], 10) || 0; return h * 3600 + mn * 60 + s; } // Format mm:ss m = cleanText.match(/^(\d+):(\d{2})$/); if (m) { const mn = parseInt(m[1], 10) || 0; const s = parseInt(m[2], 10) || 0; return mn * 60 + s; } // Format avec unités (ex: 1h 30m 45s) m = cleanText.match(/(\d+h)?(\d+m)?(\d+s)?/); if (m) { const hours = m[1] ? parseInt(m[1], 10) : 0; const minutes = m[2] ? parseInt(m[2], 10) : 0; const seconds = m[3] ? parseInt(m[3], 10) : 0; return (hours * 3600) + (minutes * 60) + seconds; } // Si on arrive ici, on essaie d'extraire tous les nombres et on suppose un format mmss const numbers = cleanText.match(/\d+/g); if (numbers && numbers.length > 0) { // Si un seul nombre, on suppose que c'est en secondes if (numbers.length === 1) { return parseInt(numbers[0], 10) || 0; } // Si deux nombres, on suppose mm:ss if (numbers.length === 2) { return (parseInt(numbers[0], 10) * 60) + (parseInt(numbers[1], 10) || 0); } // Si trois nombres, on suppose hh:mm:ss if (numbers.length >= 3) { return (parseInt(numbers[0], 10) * 3600) + (parseInt(numbers[1], 10) * 60) + (parseInt(numbers[2], 10) || 0); } } return 0; // Par défaut si aucun format n'est reconnu } async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) { try { const url = q ? 
`https://rumble.com/search/video?q=${encodeURIComponent(q)}&page=${page}` : `https://rumble.com/videos?sort=${encodeURIComponent(sort)}&page=${page}`; const html = await httpGet(url); const $ = cheerio.load(html); const found = []; // 1) Cartes "vidéos" standards (li/article/div) $('a[href^="/v"], a[href^="/video/"]').each((_, el) => { const href = $(el).attr('href') || ''; // On préfère STRICTEMENT l’ID /vXXXX let m = /^\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(href); let id = m?.[1] || null; // Fallback minimaliste pour /video/123 → on ne convertit pas ici; on laissera /video/:id passer au détails qui normalise par parse de la page. const isLegacy = !id && /^\/video\//.test(href); if (!id && !isLegacy) return; const card = $(el).closest('li, article, .video-listing-entry, .video-item, .video-card, div'); const title = (($(el).attr('title') || '') + ' ' + $(el).text()).trim() || card.find('h3, h2, .video-item--title').first().text().trim(); // Thumb robuste: data-src > src let thumb = card.find('img').first().attr('data-src') || card.find('img').first().attr('src') || ''; if (thumb && thumb.startsWith('//')) thumb = 'https:' + thumb; // Essayer plusieurs sélecteurs pour la durée, y compris les attributs data- // Ajout de plus de sélecteurs spécifiques à Rumble pour la durée const durationElement = card.find( '.video-item--duration, .video-duration, .duration, .video-item__duration, ' + '[data-duration], .videoDuration, .video-time, .time, ' + '.video-card__duration, .media__duration, .thumb-time, ' + '.video-listing-entry__duration, .video-item__duration, time' ).first(); const durationCandidates = []; if (durationElement.length) { durationCandidates.push( durationElement.attr('data-duration'), durationElement.attr('data-time'), durationElement.attr('datetime'), durationElement.attr('aria-label'), durationElement.attr('title'), durationElement.text()?.trim() ); } // Chercher dans le HTML de la carte un motif HH:MM(:SS) try { const htmlSnippet = card.html() || ''; const 
match = />\s*([0-9]+:[0-9]{2}(?::[0-9]{2})?)\s* 0) { durationSeconds = parsed; break; } } // Extraire les vues const viewsText = card.find('.video-item--views, .rumbles-views, .views, .video-item__views, [data-views]').first() .attr('data-views') || card.find('.video-item--views, .rumbles-views, .views, .video-item__views, .video-views').first().text().trim(); const views = parseInt((viewsText || '').replace(/[^\d]/g, ''), 10) || 0; // Important: on renvoie TOUJOURS une URL canonique cohérente let url = null; let videoId = null; if (id) { videoId = id; url = `https://rumble.com/${id}`; } else if (isLegacy) { // Laisse l’endpoint /video/:slug gérer la normalisation videoId = href.replace(/^\//, ''); // "video/123..." url = `https://rumble.com/${videoId}`; } // Filtrage doublons par videoId (id ou "video/123...") const key = videoId; found.push({ videoId: key, title, thumbnail: thumb, uploaderName: '', views, duration: durationSeconds, uploadedDate: '', url, type: 'video' }); }); // De-dupe const seen = new Set(); const unique = []; for (const it of found) { if (!it.videoId) continue; if (seen.has(it.videoId)) continue; seen.add(it.videoId); unique.push(it); } // Limite + nextCursor (page-based) const list = unique.slice(0, limit); const nextCursor = list.length === limit ? String(Number(page) + 1) : null; return { items: list, total: unique.length, page: Number(page), limit: Number(limit), nextCursor }; } catch (e) { return { items: [], total: 0, page: Number(page), limit: Number(limit), nextCursor: null, error: (e && e.message) ? 
e.message : String(e) }; } } /* --------------------------------- Routes -------------------------------- */ router.get('/browse', async (req, res) => { const page = Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1); const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24)); const sort = String(req.query.sort || 'viral'); const key = cacheKey('/browse', { page, limit, sort }); const cached = getCache(key); if (cached) return res.json(cached); const data = await scrapeRumbleList({ page, limit, sort }); setCache(key, data); return res.json(data); }); router.get('/search', async (req, res) => { const q = String(req.query.q || '').trim(); if (!q) return res.status(400).json({ error: 'Query parameter required' }); const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24)); const page = (() => { if (req.query.offset != null) { const offset = parseInt(String(req.query.offset), 10) || 0; return Math.floor(offset / limit) + 1; } return Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1); })(); const key = cacheKey('/search', { q, page, limit }); const cached = getCache(key); if (cached) return res.json(cached); const data = await scrapeRumbleList({ q, page, limit }); setCache(key, data); return res.json(data); }); // Endpoint details. Accepte :videoId pouvant être "vXXXX" OU "video/123..." 
/**
 * GET /video/:videoId(*) — full metadata for one video.
 * Responds from the cache when possible; a scrape result carrying `error` is
 * answered with 404 and is not cached.
 */
async function handleVideoDetails(req, res) {
  try {
    const raw = String(req.params.videoId);

    const key = cacheKey('/video', { videoId: raw });
    const cached = getCache(key);
    if (cached) return res.json(cached);

    // Normalize as far as possible before scraping.
    const norm = normalizeRumbleId(raw) || { urlCanonique: `https://rumble.com/${raw}` };
    const data = await scrapeRumbleVideo(norm.id || norm.urlCanonique);
    if (data.error) return res.status(404).json(data);

    setCache(key, data);
    return res.json(data);
  } catch (error) {
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
}

/* ------------------ Option: ad-free "pre-player" (non-embed) --------------- */
/**
 * GET /video/:videoId/preplay
 *
 * Ads are NOT disabled on the Rumble side (there is no reliable official
 * parameter). Instead a "preplay" can be served:
 *  - Show the thumbnail/title.
 *  - On click: (A) open in Rumble (cleanest UX), or (B) inject the official
 *    iframe (which triggers their ad logic).
 * This route only returns the metadata needed by that pre-player component.
 */
async function handleVideoPreplay(req, res) {
  try {
    const raw = String(req.params.videoId);
    const norm = normalizeRumbleId(raw) || { urlCanonique: `https://rumble.com/${raw}` };
    const data = await scrapeRumbleVideo(norm.id || norm.urlCanonique);
    if (data.error) return res.status(404).json(data);

    return res.json({
      videoId: data.videoId,
      title: data.title,
      thumbnail: data.thumbnail,
      rumbleUrl: data.url, // "Open on the provider's site" button
      embedUrl: data.embedUrl, // deferred injection if the user insists on playing here
    });
  } catch (error) {
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
}

// Order matters: the wildcard details route also matches ".../preplay", so the
// more specific preplay route has to be registered first to be reachable.
router.get('/video/:videoId/preplay', handleVideoPreplay);
router.get('/video/:videoId(*)', handleVideoDetails);

export default router;