refactor: improve Rumble video scraping with robust ID normalization and metadata parsing

This commit is contained in:
Bruno Charest 2025-09-19 09:33:54 -04:00
parent 709b2e55c2
commit d6da699c54
2 changed files with 320 additions and 88 deletions

Binary file not shown.

View File

@ -5,29 +5,26 @@ import rateLimit from 'express-rate-limit';
const router = express.Router(); const router = express.Router();
// Rate limiter for Rumble scraping to prevent being blocked /* ----------------------------- Rate limiting ----------------------------- */
const rumbleLimiter = rateLimit({ const rumbleLimiter = rateLimit({
windowMs: 60 * 1000, // 1 min windowMs: 60 * 1000,
max: 20, max: 20,
standardHeaders: true, standardHeaders: true,
legacyHeaders: false, legacyHeaders: false,
message: { error: 'Too many requests to Rumble API. Please try again later.' } message: { error: 'Too many requests to Rumble. Please try again later.' }
}); });
router.use(rumbleLimiter); router.use(rumbleLimiter);
// Simple in-memory cache with TTL /* --------------------------------- Cache -------------------------------- */
const cache = new Map(); const cache = new Map();
const TTL_MS = 60 * 1000; // 60s const TTL_MS = 60 * 1000; // 60s
function cacheKey(path, params) { function cacheKey(path, params) {
return `${path}?${new URLSearchParams(params).toString()}`; return `${path}?${new URLSearchParams(params).toString()}`;
} }
function setCache(key, data) { function setCache(key, data) {
cache.set(key, { data, expires: Date.now() + TTL_MS }); cache.set(key, { data, expires: Date.now() + TTL_MS });
} }
function getCache(key) { function getCache(key) {
const hit = cache.get(key); const hit = cache.get(key);
if (!hit) return null; if (!hit) return null;
@ -35,56 +32,202 @@ function getCache(key) {
return hit.data; return hit.data;
} }
/* ------------------------------- HTTP GET -------------------------------- */
async function httpGet(url) { async function httpGet(url) {
const resp = await axios.get(url, { const resp = await axios.get(url, {
headers: { headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', // UA “desktop” moderne pour minimiser les anti-bot simples
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.8' 'Accept-Language': 'en-US,en;q=0.8'
}, },
timeout: 15000 timeout: 15000,
// Important: pas de redirects inter-domain hasardeux
maxRedirects: 3,
validateStatus: s => s >= 200 && s < 400
}); });
return resp.data; return resp.data;
} }
async function scrapeRumbleVideo(videoId) { /* ------------------------- Utils: normalisation ID ------------------------ */
try { /**
const html = await httpGet(`https://rumble.com/${videoId}`); * Rumble expose plusieurs formes:
const $ = cheerio.load(html); * - Page canoniques: https://rumble.com/v6siqxf-some-title.html
const title = $('h1.video-title, .video-title h1').first().text().trim() || $('meta[property="og:title"]').attr('content') || ''; * - Ancienne forme: https://rumble.com/video/12345
const thumbnail = $('meta[property="og:image"]').attr('content') || ''; * - URL dembed officielle: https://rumble.com/embed/v6siqxf/
const uploaderName = $('.media-by--a, .channel-name').first().text().trim() || ''; * - ID brut attendu: v6siqxf (toujours commence par 'v' + base62)
const viewsText = $('.rumbles-views, .video-views').first().text().trim(); *
const views = parseInt((viewsText || '').replace(/[^0-9]/g, '')) || 0; * Cette fonction accepte: ID ou URL et renvoie { id: 'vXXXX', urlCanonique, embedUrl }
const durationText = $('meta[property="video:duration"]').attr('content'); */
const duration = durationText ? parseInt(durationText) : 0; function normalizeRumbleId(input, { preferEmbed = true } = {}) {
const uploadedDate = $('meta[property="article:published_time"]').attr('content') || ''; if (!input) return null;
const description = $('meta[property="og:description"]').attr('content') || '';
// Try to extract the official embed URL let id = null;
let embedUrl = $('meta[property="og:video"], meta[name="twitter:player"]').attr('content') || ''; let urlCanonique = null;
if (!embedUrl) { let embedUrl = null;
const iframeSrc = $('iframe[src*="/embed/"]').attr('src') || '';
embedUrl = iframeSrc || ''; // 1) Si on nous donne déjà un ID "vXXXX"
} const clean = String(input).trim();
// Normalize protocol-less URLs const mIdOnly = /^v[0-9A-Za-z]+$/.exec(clean);
if (embedUrl && embedUrl.startsWith('//')) embedUrl = 'https:' + embedUrl; if (mIdOnly) {
// Detect canonical URL to extract stable ID id = clean;
const canonicalUrl = $('link[rel="canonical"]').attr('href') || $('meta[property="og:url"]').attr('content') || ''; urlCanonique = `https://rumble.com/${id}`;
// Normalize/derive the stable Rumble ID (e.g., v464efu) embedUrl = `https://rumble.com/embed/${id}/`;
let stableId = videoId; return { id, urlCanonique, embedUrl };
const mEmbed = /\/embed\/(v[0-9A-Za-z]+)/.exec(embedUrl || '');
const mCanon = /\/(v[0-9A-Za-z]+)(?:[\-./]|$)/.exec(canonicalUrl || '');
if (mEmbed && mEmbed[1]) stableId = mEmbed[1];
else if (mCanon && mCanon[1]) stableId = mCanon[1];
// If embedUrl is a page URL, convert to embed path as a fallback
if (!/\/embed\//.test(embedUrl)) {
embedUrl = `https://rumble.com/embed/${stableId}/?autoplay=2&muted=1`;
}
return { videoId: stableId, title: title || 'Untitled Video', thumbnail, uploaderName: uploaderName || 'Unknown Uploader', views, duration, uploadedDate, description, url: `https://rumble.com/${stableId}`, embedUrl, type: 'video' };
} catch (e) {
console.error('scrapeRumbleVideo error:', e.message);
return { videoId, error: 'Scraping failed' };
} }
// 2) Si on nous donne une URL
try {
const u = new URL(clean, 'https://rumble.com');
// /embed/vXXXX/
let m = /\/embed\/(v[0-9A-Za-z]+)/.exec(u.pathname);
if (!m) m = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(u.pathname);
if (!m) {
// ancienne forme /video/123 → on ne sait pas convertir de manière fiable
const mOld = /\/video\/([0-9A-Za-z]+)/.exec(u.pathname);
if (mOld) {
// On garde lURL telle quelle et laissera le parseur de la page extraire le vrai vID.
return { id: null, urlCanonique: u.href, embedUrl: null };
}
return null;
}
id = m[1];
urlCanonique = `https://rumble.com/${id}`;
embedUrl = `https://rumble.com/embed/${id}/`;
return { id, urlCanonique, embedUrl };
} catch {
return null;
}
}
/* ---------------------- Parsing robuste d'une PAGE vidéo ------------------ */
/**
 * Derives the identity of a Rumble video from an already-loaded video page.
 *
 * Sources are tried in this order, and the first one that yields an id wins:
 *   1. the inline player call `Rumble("play", {..., "video":"vXXXX", ...})`;
 *   2. `<meta property="og:video">`, else `<meta name="twitter:player">`
 *      (expected to hold an `/embed/vXXXX/` URL);
 *   3. `<link rel="canonical">`, else `<meta property="og:url">`
 *      (expected to hold a `/vXXXX-...` page URL).
 *
 * Author's note: the inline `"video":"vXXXX"` value was chosen as the primary
 * source because, from public observation, it is the id the official embed
 * expects.
 *
 * Nothing from the page is executed; every step is a plain regex match.
 *
 * @param {Function} $ - Document handle. Only `$.html()` and
 *   `$(selector).attr(name)` are used.
 * @returns {{id: string, embedUrl: string, urlCanonique: string} | null}
 *   The id with its derived embed and page URLs, or `null` when no source
 *   contains a recognizable id.
 */
function extractVideoIdentity($) {
  // All three sources produce the same result shape from the bare id.
  const toIdentity = (videoId) => ({
    id: videoId,
    embedUrl: `https://rumble.com/embed/${videoId}/`,
    urlCanonique: `https://rumble.com/${videoId}`
  });

  // 1) Inline player bootstrap, matched against the serialized document.
  const pageHtml = $.html() || '';
  const fromPlayer =
    /Rumble\(\s*["']play["']\s*,\s*{[^}]*["']video["']\s*:\s*["'](v[0-9A-Za-z]+)["']/s.exec(pageHtml);
  if (fromPlayer && fromPlayer[1]) {
    return toIdentity(fromPlayer[1]);
  }

  // 2) Embed URL advertised through Open Graph / Twitter player metadata.
  const embedMeta =
    $('meta[property="og:video"]').attr('content')
    || $('meta[name="twitter:player"]').attr('content');
  if (embedMeta) {
    // Protocol-relative URLs are made absolute before matching.
    const absoluteEmbed = embedMeta.startsWith('//') ? 'https:' + embedMeta : embedMeta;
    const fromEmbed = /\/embed\/(v[0-9A-Za-z]+)/.exec(absoluteEmbed);
    if (fromEmbed) {
      return toIdentity(fromEmbed[1]);
    }
  }

  // 3) Canonical page URL.
  const pageUrl =
    $('link[rel="canonical"]').attr('href')
    || $('meta[property="og:url"]').attr('content');
  if (pageUrl) {
    const fromCanonical = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(pageUrl);
    if (fromCanonical) {
      return toIdentity(fromCanonical[1]);
    }
  }

  return null;
}
/* -------------------------- Scraper d'une vidéo -------------------------- */
/**
 * Fetches a Rumble video page and returns normalized metadata for it.
 *
 * @param {string} videoIdOrUrl - A raw video id ("vXXXX") or a video page /
 *   embed URL.
 * @returns {Promise<object>} On success:
 *   `{ videoId, title, thumbnail, uploaderName, views, duration, uploadedDate,
 *      description, url, embedUrl, type: 'video' }`.
 *   On failure an object carrying an `error` string (plus `input` when the
 *   request was refused or no id could be determined). Errors raised while
 *   fetching or parsing are caught and reported through `error`.
 */
async function scrapeRumbleVideo(videoIdOrUrl) {
  try {
    // Accepts either a bare id or a full URL.
    const norm = normalizeRumbleId(videoIdOrUrl);
    const fetchUrl = norm?.urlCanonique || `https://rumble.com/${videoIdOrUrl}`;

    // The legacy "/video/<n>" form is passed through with whatever host the
    // caller supplied, so refuse anything that is not an http(s) rumble.com
    // URL before issuing a server-side request.
    const { protocol, hostname } = new URL(fetchUrl);
    const isHttp = protocol === 'https:' || protocol === 'http:';
    const isRumbleHost = hostname === 'rumble.com' || hostname.endsWith('.rumble.com');
    if (!isHttp || !isRumbleHost) {
      return { error: 'Unsupported Rumble URL', input: videoIdOrUrl };
    }

    const html = await httpGet(fetchUrl);
    const $ = cheerio.load(html);

    // Reliable identity (id + embed URL + canonical URL) taken from the page.
    // Parsing the same document a second time cannot change the outcome, so a
    // single attempt is made.
    const ident = extractVideoIdentity($);
    if (!ident?.id) {
      return { error: 'Unable to determine Rumble video ID', input: videoIdOrUrl };
    }

    // Metadata: visible markup first, then meta tags.
    const title =
      $('h1.video-title, .video-title h1').first().text().trim()
      || $('meta[property="og:title"]').attr('content')
      || 'Untitled Video';

    const rawThumbnail = $('meta[property="og:image"]').attr('content') || '';
    const thumbnail = rawThumbnail.startsWith('//') ? `https:${rawThumbnail}` : rawThumbnail;

    const uploaderName =
      $('.media-by--a, .channel-name, a[href*="/c/"]').first().text().trim() || '';

    const viewsText =
      $('.rumbles-views, .video-views, .media-view-count, [data-view-count]').first().text().trim() || '';
    // Keep digits only; this drops separators such as "," before parsing.
    const views = parseInt(viewsText.replace(/[^\d]/g, ''), 10) || 0;

    const duration =
      parseInt($('meta[property="video:duration"]').attr('content') || '', 10) || 0;

    const uploadedDate =
      $('meta[property="article:published_time"]').attr('content')
      || $('time[datetime]').attr('datetime')
      || '';

    const description =
      $('meta[property="og:description"]').attr('content')
      || $('meta[name="description"]').attr('content')
      || '';

    return {
      videoId: ident.id,
      title,
      thumbnail,
      uploaderName,
      views,
      duration,
      uploadedDate,
      description,
      url: ident.urlCanonique,
      // Always the embed URL derived from the id, never one copied from the page.
      embedUrl: ident.embedUrl,
      type: 'video'
    };
  } catch (e) {
    const msg = (e && e.message) ? e.message : String(e);
    return { error: `Scraping failed: ${msg}` };
  }
}
/* ------------------ Scraper de liste (search / browse) ------------------ */
/**
 * Converts a clock-style duration label into a number of seconds.
 *
 * Accepted shapes are "mm:ss" and "hh:mm:ss" (first field one or two digits,
 * the others exactly two). Surrounding whitespace is ignored.
 *
 * @param {string} text - Duration label, e.g. "12:34" or "1:02:03".
 * @returns {number} Total seconds, or 0 when `text` is empty or does not match
 *   one of the accepted shapes.
 */
function parseDurationToSeconds(text) {
  if (!text) return 0;

  const m = String(text).trim().match(/^(\d{1,2}):(\d{2})(?::(\d{2}))?$/);
  if (!m) return 0;

  const [, first, second, third] = m;

  // The third group only exists for "hh:mm:ss"; in that case the fields shift
  // one position to the left compared with "mm:ss".
  const hasHours = third !== undefined;
  const h = hasHours ? parseInt(first, 10) : 0;
  const mn = parseInt(hasHours ? second : first, 10);
  const s = parseInt(hasHours ? third : second, 10);

  return h * 3600 + mn * 60 + s;
}
async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) { async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
@ -92,54 +235,113 @@ async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
const url = q const url = q
? `https://rumble.com/search/video?q=${encodeURIComponent(q)}&page=${page}` ? `https://rumble.com/search/video?q=${encodeURIComponent(q)}&page=${page}`
: `https://rumble.com/videos?sort=${encodeURIComponent(sort)}&page=${page}`; : `https://rumble.com/videos?sort=${encodeURIComponent(sort)}&page=${page}`;
const html = await httpGet(url); const html = await httpGet(url);
const $ = cheerio.load(html); const $ = cheerio.load(html);
const items = [];
// Try to select video cards; Rumble uses different layouts, so search broadly const found = [];
// 1) Cartes "vidéos" standards (li/article/div)
$('a[href^="/v"], a[href^="/video/"]').each((_, el) => { $('a[href^="/v"], a[href^="/video/"]').each((_, el) => {
const a = $(el); const href = $(el).attr('href') || '';
const href = a.attr('href') || ''; // On préfère STRICTEMENT lID /vXXXX
// Expect href like /vabcdef or /video/abcdef let m = /^\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(href);
const m = href.match(/\/v([A-Za-z0-9]+)/) || href.match(/\/video\/([A-Za-z0-9]+)/); let id = m?.[1] || null;
if (!m) return;
const vid = `v${m[1]}`; // Fallback minimaliste pour /video/123 → on ne convertit pas ici; on laissera /video/:id passer au détails qui normalise par parse de la page.
const title = a.attr('title') || a.text().trim(); const isLegacy = !id && /^\/video\//.test(href);
// Look around for thumbnail and meta
const parent = a.closest('li, article, div'); if (!id && !isLegacy) return;
const img = parent.find('img').first();
let thumb = img.attr('data-src') || img.attr('src') || ''; const card = $(el).closest('li, article, .video-listing-entry, .video-item, .video-card, div');
const title = (($(el).attr('title') || '') + ' ' + $(el).text()).trim() || card.find('h3, h2, .video-item--title').first().text().trim();
// Thumb robuste: data-src > src
let thumb =
card.find('img').first().attr('data-src')
|| card.find('img').first().attr('src')
|| '';
if (thumb && thumb.startsWith('//')) thumb = 'https:' + thumb; if (thumb && thumb.startsWith('//')) thumb = 'https:' + thumb;
const durationText = parent.find('.video-item--duration, .video-duration, .duration').first().text().trim();
const viewsText = parent.find('.video-item--views, .rumbles-views, .views').first().text().trim(); const durationText =
const duration = (() => { card.find('.video-item--duration, .video-duration, .duration, .video-item__duration').first().text().trim();
const m = durationText.match(/(\d+):(\d+)(?::(\d+))?/); const viewsText =
if (!m) return 0; card.find('.video-item--views, .rumbles-views, .views, .video-item__views').first().text().trim();
const h = parseInt(m[3] || '0', 10), mn = parseInt(m[1] || '0', 10), s = parseInt(m[2] || '0', 10);
return h * 3600 + mn * 60 + s; const duration = parseDurationToSeconds(durationText);
})(); const views = parseInt((viewsText || '').replace(/[^\d]/g, ''), 10) || 0;
const views = parseInt((viewsText || '').replace(/[^0-9]/g, '')) || 0;
items.push({ videoId: vid, title, thumbnail: thumb, uploaderName: '', views, duration, uploadedDate: '', url: `https://rumble.com/${vid}`, type: 'video' }); // Important: on renvoie TOUJOURS une URL canonique cohérente
let url = null;
let videoId = null;
if (id) {
videoId = id;
url = `https://rumble.com/${id}`;
} else if (isLegacy) {
// Laisse lendpoint /video/:slug gérer la normalisation
videoId = href.replace(/^\//, ''); // "video/123..."
url = `https://rumble.com/${videoId}`;
}
// Filtrage doublons par videoId (id ou "video/123...")
const key = videoId;
found.push({
videoId: key,
title,
thumbnail: thumb,
uploaderName: '',
views,
duration,
uploadedDate: '',
url,
type: 'video'
});
}); });
// De-duplicate by videoId and slice to limit
// De-dupe
const seen = new Set(); const seen = new Set();
const unique = []; const unique = [];
for (const it of items) { if (!seen.has(it.videoId)) { seen.add(it.videoId); unique.push(it); } } for (const it of found) {
if (!it.videoId) continue;
if (seen.has(it.videoId)) continue;
seen.add(it.videoId);
unique.push(it);
}
// Limite + nextCursor (page-based)
const list = unique.slice(0, limit); const list = unique.slice(0, limit);
const nextCursor = list.length === limit ? String(Number(page) + 1) : null; const nextCursor = list.length === limit ? String(Number(page) + 1) : null;
return { items: list, total: unique.length, page: Number(page), limit: Number(limit), nextCursor };
return {
items: list,
total: unique.length,
page: Number(page),
limit: Number(limit),
nextCursor
};
} catch (e) { } catch (e) {
console.error('scrapeRumbleList error:', e.message); return {
return { items: [], total: 0, page: Number(page), limit: Number(limit), nextCursor: null }; items: [],
total: 0,
page: Number(page),
limit: Number(limit),
nextCursor: null,
error: (e && e.message) ? e.message : String(e)
};
} }
} }
/* --------------------------------- Routes -------------------------------- */
router.get('/browse', async (req, res) => { router.get('/browse', async (req, res) => {
const page = parseInt(String(req.query.page || '1'), 10) || 1; const page = Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
const limit = Math.min(50, parseInt(String(req.query.limit || '24'), 10) || 24); const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));
const sort = String(req.query.sort || 'viral'); const sort = String(req.query.sort || 'viral');
const key = cacheKey('/browse', { page, limit, sort }); const key = cacheKey('/browse', { page, limit, sort });
const cached = getCache(key); const cached = getCache(key);
if (cached) return res.json(cached); if (cached) return res.json(cached);
const data = await scrapeRumbleList({ page, limit, sort }); const data = await scrapeRumbleList({ page, limit, sort });
setCache(key, data); setCache(key, data);
return res.json(data); return res.json(data);
@ -148,37 +350,67 @@ router.get('/browse', async (req, res) => {
router.get('/search', async (req, res) => { router.get('/search', async (req, res) => {
const q = String(req.query.q || '').trim(); const q = String(req.query.q || '').trim();
if (!q) return res.status(400).json({ error: 'Query parameter required' }); if (!q) return res.status(400).json({ error: 'Query parameter required' });
const limit = Math.min(50, parseInt(String(req.query.limit || '24'), 10) || 24);
const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));
const page = (() => { const page = (() => {
// Support offset-based cursor from frontend by translating offset->page
if (req.query.offset != null) { if (req.query.offset != null) {
const offset = parseInt(String(req.query.offset), 10) || 0; const offset = parseInt(String(req.query.offset), 10) || 0;
return Math.floor(offset / limit) + 1; return Math.floor(offset / limit) + 1;
} }
return parseInt(String(req.query.page || '1'), 10) || 1; return Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
})(); })();
const key = cacheKey('/search', { q, page, limit }); const key = cacheKey('/search', { q, page, limit });
const cached = getCache(key); const cached = getCache(key);
if (cached) return res.json(cached); if (cached) return res.json(cached);
const data = await scrapeRumbleList({ q, page, limit }); const data = await scrapeRumbleList({ q, page, limit });
setCache(key, data); setCache(key, data);
return res.json(data); return res.json(data);
}); });
router.get('/video/:videoId', async (req, res) => { // Endpoint details. Accepte :videoId pouvant être "vXXXX" OU "video/123..."
router.get('/video/:videoId(*)', async (req, res) => {
try { try {
const { videoId } = req.params; const raw = String(req.params.videoId);
const key = cacheKey('/video', { videoId }); const key = cacheKey('/video', { videoId: raw });
const cached = getCache(key); const cached = getCache(key);
if (cached) return res.json(cached); if (cached) return res.json(cached);
const videoData = await scrapeRumbleVideo(videoId);
if (videoData.error) return res.status(404).json({ error: 'Video not found or scraping failed' }); // Normalise au maximum avant scrape
setCache(key, videoData); const norm = normalizeRumbleId(raw) || { urlCanonique: `https://rumble.com/${raw}` };
return res.json(videoData); const data = await scrapeRumbleVideo(norm.id || norm.urlCanonique);
if (data.error) return res.status(404).json(data);
setCache(key, data);
return res.json(data);
} catch (error) { } catch (error) {
console.error('Rumble video error:', error);
return res.status(500).json({ error: 'Failed to scrape video' }); return res.status(500).json({ error: 'Failed to scrape video' });
} }
}); });
/* ----------------- Option: “prélecteur” sans pub (non-embed) -------------- */
/**
 * "Preplay" metadata endpoint.
 *
 * Ads are NOT disabled on the Rumble side (there is no reliable official
 * parameter for that). Instead the client can render a preplay component:
 *   - show the thumbnail and title;
 *   - on click, either (A) open the video on Rumble (cleanest UX), or
 *     (B) inject the official iframe, which triggers Rumble's own ad logic.
 * This route only returns the metadata that component needs.
 *
 * Responses: 200 with `{ videoId, title, thumbnail, rumbleUrl, embedUrl }`,
 * 404 with the scraper's error object, 500 on an unexpected failure.
 *
 * NOTE(review): the details route earlier in this file is declared as
 * '/video/:videoId(*)'. That wildcard looks like it also matches
 * '<id>/preplay', which would keep requests from reaching this handler —
 * confirm the registration order.
 */
router.get('/video/:videoId/preplay', async (req, res) => {
  try {
    const raw = String(req.params.videoId);

    // Same cache entry as the details route: both store the full scrape result
    // under the raw id, so a preplay request can reuse it instead of scraping
    // the page again.
    const key = cacheKey('/video', { videoId: raw });
    let data = getCache(key);

    if (!data) {
      const norm = normalizeRumbleId(raw) || { urlCanonique: `https://rumble.com/${raw}` };
      data = await scrapeRumbleVideo(norm.id || norm.urlCanonique);
      if (data.error) return res.status(404).json(data);
      setCache(key, data);
    }

    const preplay = {
      videoId: data.videoId,
      title: data.title,
      thumbnail: data.thumbnail,
      rumbleUrl: data.url, // target of the "open on the provider's site" button
      embedUrl: data.embedUrl // deferred injection if the user insists on playing here
    };
    return res.json(preplay);
  } catch (error) {
    // Same failure payload as the details route.
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
});
export default router; export default router;