import express from 'express';
import * as cheerio from 'cheerio';
import axios from 'axios';
import rateLimit from 'express-rate-limit';
const router = express.Router();

/* ----------------------------- Rate limiting ----------------------------- */
// One limiter instance, mounted on the router itself so that it runs for every
// route registered below: 20 requests per 60-second window, with the JSON
// `message` payload used as the rejection body.
const rumbleLimiter = rateLimit({
  message: { error: 'Too many requests to Rumble. Please try again later.' },
  legacyHeaders: false,
  standardHeaders: true,
  max: 20,
  windowMs: 60_000,
});
router.use(rumbleLimiter);
/* --------------------------------- Cache -------------------------------- */
// Lifetime of a cached response, in milliseconds (60 seconds).
const TTL_MS = 60_000;
// In-memory response cache: cache key -> { data, expires }.
const cache = new Map();
/**
 * Builds a cache key from a route path and its parameters.
 *
 * @param {string} path - Route identifier, e.g. '/browse'.
 * @param {Object} params - Parameters; serialised in query-string form via URLSearchParams.
 * @returns {string} `path`, a '?', then the serialised parameters.
 */
function cacheKey(path, params) {
  const serialised = new URLSearchParams(params).toString();
  return `${path}?${serialised}`;
}
/**
 * Stores `data` under `key` with an expiry of now + TTL_MS.
 *
 * Expired entries are otherwise only removed when the same key is read again,
 * so keys that are never re-requested (one-off searches, video IDs) would
 * accumulate forever. To keep memory bounded, once the cache holds 500 entries
 * an insert of a new key first sweeps out expired entries and, if the cache is
 * still full, drops the oldest-inserted entry.
 *
 * @param {string} key - Cache key (see cacheKey).
 * @param {*} data - Value to cache.
 * @returns {void}
 */
function setCache(key, data) {
  const MAX_ENTRIES = 500;
  const now = Date.now();

  if (cache.size >= MAX_ENTRIES && !cache.has(key)) {
    // Deleting from a Map while iterating over it is safe in JavaScript.
    for (const [storedKey, entry] of cache) {
      if (now > entry.expires) cache.delete(storedKey);
    }
    if (cache.size >= MAX_ENTRIES) {
      // Map iteration follows insertion order, so the first key is the oldest.
      cache.delete(cache.keys().next().value);
    }
  }

  cache.set(key, { data, expires: now + TTL_MS });
}
/**
 * Looks up a cached value.
 *
 * @param {string} key - Cache key (see cacheKey).
 * @returns {*} The stored data, or null when the key is absent or its entry
 *   has expired. An expired entry is removed from the cache as a side effect.
 */
function getCache(key) {
  const entry = cache.get(key);
  if (!entry) return null;

  const isStale = Date.now() > entry.expires;
  if (isStale) cache.delete(key);
  return isStale ? null : entry.data;
}
/* ------------------------------- HTTP GET -------------------------------- */
/**
 * Performs a GET request with axios using browser-like headers and resolves
 * with the response payload (`data`).
 *
 * Statuses from 200 up to (but excluding) 400 are treated as success via
 * `validateStatus`; the request times out after 15 seconds.
 *
 * @param {string} url - URL to request.
 * @returns {Promise} Resolves with the response body as provided by axios.
 */
async function httpGet(url) {
  // Modern "desktop" browser headers, to get past simple anti-bot checks.
  const browserHeaders = {
    'User-Agent':
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
    'Accept':
      'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
  };

  const requestOptions = {
    headers: browserHeaders,
    timeout: 15000,
    // Caps the number of redirect hops that are followed.
    // NOTE(review): this limits the hop count only; it does not by itself
    // prevent a redirect to a different domain.
    maxRedirects: 3,
    validateStatus: (status) => status >= 200 && status < 400,
  };

  const { data } = await axios.get(url, requestOptions);
  return data;
}
/* ------------------------- Utils: normalisation ID ------------------------ */
/**
 * Normalises a Rumble video reference (bare ID or URL).
 *
 * Forms handled:
 *   - canonical page:  https://rumble.com/v6siqxf-some-title.html
 *   - legacy page:     https://rumble.com/video/12345
 *   - official embed:  https://rumble.com/embed/v6siqxf/
 *   - bare ID:         v6siqxf (a 'v' followed by alphanumerics)
 *
 * Relative inputs such as "video/12345" are resolved against https://rumble.com.
 *
 * @param {string} input - Bare ID or URL.
 * @param {Object} [options]
 * @param {boolean} [options.preferEmbed=true] - Accepted for compatibility;
 *   it currently has no effect on the result.
 * @returns {{id: (string|null), urlCanonique: string, embedUrl: (string|null)}|null}
 *   - for an ID, embed URL or canonical page: all three fields are set;
 *   - for a legacy /video/... URL: `id` and `embedUrl` are null and
 *     `urlCanonique` is the legacy page on rumble.com (the real ID has to be
 *     extracted from that page);
 *   - null when the input is empty or not recognised.
 */
function normalizeRumbleId(input, { preferEmbed = true } = {}) {
  if (!input) return null;
  const clean = String(input).trim();

  const fromId = (id) => ({
    id,
    urlCanonique: `https://rumble.com/${id}`,
    embedUrl: `https://rumble.com/embed/${id}/`,
  });

  // 1) Already a bare "vXXXX" ID.
  if (/^v[0-9A-Za-z]+$/.test(clean)) return fromId(clean);

  // 2) Otherwise treat the input as a (possibly relative) URL.
  let parsed;
  try {
    parsed = new URL(clean, 'https://rumble.com');
  } catch {
    return null;
  }
  const { pathname, search } = parsed;

  // /embed/vXXXX/
  const embedMatch = /\/embed\/(v[0-9A-Za-z]+)/.exec(pathname);
  if (embedMatch) return fromId(embedMatch[1]);

  // Legacy /video/123. This must be tested BEFORE the generic "/vXXXX" pattern,
  // because "/video/" itself looks like an ID ("video") followed by "/".
  // The legacy number cannot be converted reliably, so the page URL is returned
  // and the page parser is left to find the real ID. The origin is pinned to
  // rumble.com so an absolute URL on another host is never fetched as-is.
  if (/\/video\/[0-9A-Za-z]+/.test(pathname)) {
    return { id: null, urlCanonique: `https://rumble.com${pathname}${search}`, embedUrl: null };
  }

  // Canonical page /vXXXX-some-title.html. The lookahead skips the site's own
  // "video" / "videos" path segments, which are not video IDs.
  const pageMatch = /\/(?!videos?(?:[-/.]|$))(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(pathname);
  return pageMatch ? fromId(pageMatch[1]) : null;
}
/* ---------------------- Parsing robuste d’une PAGE vidéo ------------------ */
/**
 * Extracts the video identity from a loaded video page.
 *
 * Sources, in priority order:
 *   1. the inline player bootstrap script: Rumble("play", {..., "video":"vXXXX", ...});
 *   2. the og:video meta tag, else the twitter:player meta tag
 *      (typically an /embed/vXXXX/ URL);
 *   3. the canonical link, else the og:url meta tag (a /vXXXX-... page URL).
 *
 * Nothing from the page is executed; step 1 is a regex over the serialised HTML.
 *
 * @param {Function} $ - cheerio root for the page (`cheerio.load(html)`).
 * @returns {{id: string, embedUrl: string, urlCanonique: string}|null}
 *   The identity built from the first source that yields an ID, or null.
 */
function extractVideoIdentity($) {
  const identityFor = (id) => ({
    id,
    embedUrl: `https://rumble.com/embed/${id}/`,
    urlCanonique: `https://rumble.com/${id}`,
  });

  // 1) Inline script: Rumble('play', { ... "video": "vXXXX" ... })
  const html = $.html() || '';
  const playMatch =
    /Rumble\(\s*["']play["']\s*,\s*{[^}]*["']video["']\s*:\s*["'](v[0-9A-Za-z]+)["']/s.exec(html);
  if (playMatch) return identityFor(playMatch[1]);

  // 2) og:video / twitter:player -> .../embed/vXXXX/...
  const embed =
    $('meta[property="og:video"]').attr('content')
    || $('meta[name="twitter:player"]').attr('content');
  if (embed) {
    const embedMatch = /\/embed\/(v[0-9A-Za-z]+)/.exec(embed);
    if (embedMatch) return identityFor(embedMatch[1]);
  }

  // 3) canonical / og:url -> .../vXXXX-...
  const canon =
    $('link[rel="canonical"]').attr('href')
    || $('meta[property="og:url"]').attr('content');
  if (canon) {
    // Match against the path only: run on the full URL, the pattern could hit
    // the host name (e.g. "//videos.example.com"). The lookahead also skips the
    // site's own "video" / "videos" segments (legacy /video/123 pages), which
    // would otherwise be reported as the ID "video".
    let pathname = '';
    try {
      pathname = new URL(canon, 'https://rumble.com').pathname;
    } catch {
      pathname = '';
    }
    const pageMatch = /\/(?!videos?(?:[-/.]|$))(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(pathname);
    if (pageMatch) return identityFor(pageMatch[1]);
  }

  return null;
}
/* -------------------------- Scraper d’une vidéo -------------------------- */
/**
 * Fetches a Rumble video page and extracts its identity and metadata.
 *
 * @param {string} videoIdOrUrl - Bare ID ("vXXXX"), path ("video/123") or full URL.
 * @returns {Promise<Object>} On success:
 *   { videoId, title, thumbnail, uploaderName, views, duration, uploadedDate,
 *     description, url, embedUrl, type: 'video' }.
 *   On failure an object with an `error` string (plus `input` when the page
 *   was fetched but no video ID could be found). This function does not
 *   throw: every exception is converted into the `error` form.
 */
async function scrapeRumbleVideo(videoIdOrUrl) {
  try {
    const norm = normalizeRumbleId(videoIdOrUrl);

    let fetchUrl = norm?.urlCanonique;
    if (!fetchUrl) {
      // Unrecognised input: resolve it against rumble.com instead of blindly
      // prefixing it. Prefixing turned an input that was already a URL into
      // "https://rumble.com/https://rumble.com/...". Only the path and query
      // are kept, so the request always targets rumble.com.
      const resolved = new URL(String(videoIdOrUrl), 'https://rumble.com');
      fetchUrl = `https://rumble.com${resolved.pathname}${resolved.search}`;
    }

    const html = await httpGet(fetchUrl);
    const $ = cheerio.load(html);

    // Authoritative identity (id + embed + canonical URL) comes from the page.
    const ident = extractVideoIdentity($);
    if (!ident?.id) {
      return { error: 'Unable to determine Rumble video ID', input: videoIdOrUrl };
    }

    // Metadata: visible markup first, meta tags as fallback.
    const title =
      $('h1.video-title, .video-title h1').first().text().trim()
      || $('meta[property="og:title"]').attr('content')
      || 'Untitled Video';

    let thumbnail = $('meta[property="og:image"]').attr('content') || '';
    if (thumbnail.startsWith('//')) thumbnail = `https:${thumbnail}`;

    const uploaderName =
      $('.media-by--a, .channel-name, a[href*="/c/"]').first().text().trim() || '';

    const viewsText =
      $('.rumbles-views, .video-views, .media-view-count, [data-view-count]').first().text().trim() || '';
    // Keep digits only (drops separators and labels) before parsing.
    const views = Number.parseInt(viewsText.replace(/[^\d]/g, ''), 10) || 0;

    const duration =
      Number.parseInt($('meta[property="video:duration"]').attr('content') || '', 10) || 0;

    const uploadedDate =
      $('meta[property="article:published_time"]').attr('content')
      || $('time[datetime]').attr('datetime')
      || '';

    const description =
      $('meta[property="og:description"]').attr('content')
      || $('meta[name="description"]').attr('content')
      || '';

    return {
      videoId: ident.id,
      title,
      thumbnail,
      uploaderName,
      views,
      duration,
      uploadedDate,
      description,
      url: ident.urlCanonique,
      // Always the official embed form derived from the extracted ID.
      embedUrl: ident.embedUrl,
      type: 'video',
    };
  } catch (e) {
    const msg = (e && e.message) ? e.message : String(e);
    return { error: `Scraping failed: ${msg}` };
  }
}
/* ------------------ Scraper de liste (search / browse) ------------------ */
/**
 * Converts a clock-style duration ("mm:ss" or "hh:mm:ss") to seconds.
 *
 * @param {string} text - Duration text; surrounding whitespace is ignored.
 * @returns {number} Total seconds, or 0 when `text` is empty or not in one of
 *   the two supported formats.
 */
function parseDurationToSeconds(text) {
  if (!text) return 0;
  const match = /^(\d{1,2}):(\d{2})(?::(\d{2}))?$/.exec(String(text).trim());
  if (!match) return 0;

  const [, first, second, third] = match;
  // With three fields the groups are hh:mm:ss; with two they are mm:ss.
  // The fields must be assigned by position count, not by group index.
  const hasHours = third !== undefined;
  const hours = hasHours ? Number.parseInt(first, 10) : 0;
  const minutes = Number.parseInt(hasHours ? second : first, 10);
  const seconds = Number.parseInt(hasHours ? third : second, 10);
  return hours * 3600 + minutes * 60 + seconds;
}
/**
 * Scrapes one page of a Rumble listing: search results when `q` is given,
 * otherwise the browse listing sorted by `sort`.
 *
 * @param {Object} options
 * @param {string} [options.q] - Search query; when falsy the browse listing is used.
 * @param {number|string} [options.page=1] - Page number (1-based).
 * @param {number|string} [options.limit=24] - Maximum number of items returned.
 * @param {string} [options.sort='viral'] - Browse sort key (ignored when `q` is set).
 * @returns {Promise<Object>} { items, total, page, limit, nextCursor } where
 *   `total` is the number of unique videos found on the scraped page and
 *   `nextCursor` is the next page number as a string when the page was full,
 *   else null. On failure the same shape with empty `items` and an `error`
 *   string. This function does not throw.
 */
async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
  const pageNumber = Number(page);
  const pageSize = Number(limit);
  try {
    const url = q
      ? `https://rumble.com/search/video?q=${encodeURIComponent(q)}&page=${page}`
      : `https://rumble.com/videos?sort=${encodeURIComponent(sort)}&page=${page}`;
    const html = await httpGet(url);
    const $ = cheerio.load(html);

    // Several anchors usually point at the same video; the first one wins.
    const seen = new Set();
    const unique = [];

    $('a[href^="/v"], a[href^="/video/"]').each((_, el) => {
      const anchor = $(el);
      const href = anchor.attr('href') || '';

      // Legacy /video/123 links are detected FIRST: "/video/" also matches the
      // generic "/vXXXX" pattern (as the ID "video"), which used to collapse
      // every legacy link into one bogus item. The lookahead additionally
      // skips the site's own "/video" and "/videos" navigation links.
      const isLegacy = /^\/video\//.test(href);
      const idMatch = isLegacy
        ? null
        : /^\/(?!videos?(?:[-/.]|$))(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(href);
      if (!isLegacy && !idMatch) return;

      // Legacy items keep "video/123..." as their ID; the details endpoint
      // resolves the real ID by parsing the page.
      const videoId = isLegacy ? href.replace(/^\//, '') : idMatch[1];
      if (seen.has(videoId)) return;
      seen.add(videoId);

      const card = anchor.closest('li, article, .video-listing-entry, .video-item, .video-card, div');

      // Prefer the title attribute, then the link text, then the card heading.
      // (Concatenating attribute and text duplicated the title when both were set.)
      const title =
        (anchor.attr('title') || '').trim()
        || anchor.text().trim()
        || card.find('h3, h2, .video-item--title').first().text().trim();

      // Lazy-loaded images carry the real URL in data-src.
      const img = card.find('img').first();
      let thumb = img.attr('data-src') || img.attr('src') || '';
      if (thumb.startsWith('//')) thumb = `https:${thumb}`;

      const durationText =
        card.find('.video-item--duration, .video-duration, .duration, .video-item__duration').first().text().trim();
      const viewsText =
        card.find('.video-item--views, .rumbles-views, .views, .video-item__views').first().text().trim();

      unique.push({
        videoId,
        title,
        thumbnail: thumb,
        uploaderName: '',
        views: Number.parseInt((viewsText || '').replace(/[^\d]/g, ''), 10) || 0,
        duration: parseDurationToSeconds(durationText),
        uploadedDate: '',
        url: `https://rumble.com/${videoId}`,
        type: 'video',
      });
    });

    // Page-based cursor: a full page suggests there may be another one.
    const list = unique.slice(0, pageSize);
    const nextCursor = list.length === pageSize ? String(pageNumber + 1) : null;

    return {
      items: list,
      total: unique.length,
      page: pageNumber,
      limit: pageSize,
      nextCursor,
    };
  } catch (e) {
    return {
      items: [],
      total: 0,
      page: pageNumber,
      limit: pageSize,
      nextCursor: null,
      error: (e && e.message) ? e.message : String(e),
    };
  }
}
/* --------------------------------- Routes -------------------------------- */
/**
 * GET /browse — one page of the browse listing.
 *
 * Query: `page` (>= 1, default 1), `limit` (1..50, default 24),
 * `sort` (default 'viral'). Responds with the scrapeRumbleList payload.
 */
router.get('/browse', async (req, res) => {
  const page = Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
  const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));
  const sort = String(req.query.sort || 'viral');

  const key = cacheKey('/browse', { page, limit, sort });
  const cached = getCache(key);
  if (cached) return res.json(cached);

  const data = await scrapeRumbleList({ page, limit, sort });
  // A failed scrape comes back as an empty list carrying an `error` field.
  // Caching it would keep serving the failure for a whole TTL, so only
  // successful results are stored.
  if (!data.error) setCache(key, data);
  return res.json(data);
});
/**
 * GET /search — one page of search results.
 *
 * Query: `q` (required), `limit` (1..50, default 24) and either `offset`
 * (item offset, converted to a page number) or `page` (>= 1, default 1).
 * Responds 400 when `q` is empty, otherwise with the scrapeRumbleList payload.
 */
router.get('/search', async (req, res) => {
  const q = String(req.query.q || '').trim();
  if (!q) return res.status(400).json({ error: 'Query parameter required' });

  const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));

  let page;
  if (req.query.offset != null) {
    // Clamp to 0 first: a negative offset would otherwise yield page <= 0.
    const offset = Math.max(0, parseInt(String(req.query.offset), 10) || 0);
    page = Math.floor(offset / limit) + 1;
  } else {
    page = Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
  }

  const key = cacheKey('/search', { q, page, limit });
  const cached = getCache(key);
  if (cached) return res.json(cached);

  const data = await scrapeRumbleList({ q, page, limit });
  // A failed scrape comes back as an empty list carrying an `error` field.
  // Caching it would keep serving the failure for a whole TTL, so only
  // successful results are stored.
  if (!data.error) setCache(key, data);
  return res.json(data);
});
/**
 * GET /video/<id> — video details.
 *
 * The parameter captures the rest of the path, so it may be a bare ID
 * ("vXXXX") or a legacy path ("video/123...").
 * Responds 404 with the scraper's error payload when the video cannot be
 * resolved, 500 on an unexpected failure.
 */
router.get('/video/:videoId(*)', async (req, res, next) => {
  try {
    const raw = String(req.params.videoId);

    // This catch-all pattern also matches "/video/<id>/preplay". Routes are
    // tried in registration order and this one is registered first, so it has
    // to step aside explicitly or the preplay route is never reached.
    if (/^[^/]+\/preplay\/?$/.test(raw)) return next();

    const key = cacheKey('/video', { videoId: raw });
    const cached = getCache(key);
    if (cached) return res.json(cached);

    // Normalise as far as possible before scraping. When the input is not
    // recognised, hand the raw value to the scraper, which builds the
    // rumble.com URL itself; passing a pre-built URL here would get the
    // prefix applied a second time.
    const norm = normalizeRumbleId(raw);
    const data = await scrapeRumbleVideo(norm?.id || norm?.urlCanonique || raw);
    if (data.error) return res.status(404).json(data);

    setCache(key, data);
    return res.json(data);
  } catch (error) {
    console.error('Rumble video details request failed:', error);
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
});
/* ----------------- Option: “prélecteur” sans pub (non-embed) -------------- */
/**
 * GET /video/:videoId/preplay — metadata for a "pre-player" component.
 *
 * Ads are NOT disabled on the Rumble side (there is no reliable official
 * parameter for that). Instead the client can render a preview:
 *   - show the thumbnail and title;
 *   - on click, either (A) open the video on Rumble, or (B) inject the
 *     official iframe (which triggers Rumble's own ad logic).
 * This route only returns the fields that component needs.
 *
 * The full video data is read from / written to the same cache entry as the
 * details route (key '/video' + videoId), so a preplay request does not
 * trigger a second upstream scrape for a video that is already cached.
 *
 * NOTE(review): this route is registered after the catch-all
 * '/video/:videoId(*)' route; confirm that route yields to this one
 * (e.g. via next()), otherwise requests never reach this handler.
 */
router.get('/video/:videoId/preplay', async (req, res) => {
  try {
    const raw = String(req.params.videoId);
    const key = cacheKey('/video', { videoId: raw });

    let data = getCache(key);
    if (!data) {
      // Unrecognised input is handed to the scraper as-is; it builds the
      // rumble.com URL itself.
      const norm = normalizeRumbleId(raw);
      data = await scrapeRumbleVideo(norm?.id || norm?.urlCanonique || raw);
      if (data.error) return res.status(404).json(data);
      setCache(key, data);
    }

    return res.json({
      videoId: data.videoId,
      title: data.title,
      thumbnail: data.thumbnail,
      rumbleUrl: data.url, // "open on the provider's site" button
      embedUrl: data.embedUrl, // deferred iframe injection if the user wants to play in place
    });
  } catch (error) {
    console.error('Rumble preplay request failed:', error);
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
});
export default router;