refactor: improve Rumble video scraping with robust ID normalization and metadata parsing

2025-09-19 09:33:54 -04:00 · 2025-09-19 09:33:54 -04:00 · d6da699c54
commit d6da699c54
parent 709b2e55c2
2 changed files with 320 additions and 88 deletions
--- a/db/newtube.db
+++ b/db/newtube.db
--- a/server/rumble.mjs
+++ b/server/rumble.mjs
@ -5,29 +5,26 @@ import rateLimit from 'express-rate-limit';
 const router = express.Router();
-// Rate limiter for Rumble scraping to prevent being blocked
+/* ----------------------------- Rate limiting ----------------------------- */
 const rumbleLimiter = rateLimit({
-  windowMs: 60 * 1000, // 1 min
+  windowMs: 60 * 1000,
  max: 20,
  standardHeaders: true,
  legacyHeaders: false,
-  message: { error: 'Too many requests to Rumble API. Please try again later.' }
+  message: { error: 'Too many requests to Rumble. Please try again later.' }
 });
 router.use(rumbleLimiter);
-// Simple in-memory cache with TTL
+/* --------------------------------- Cache -------------------------------- */
 // In-memory response cache; setCache() stores entries shaped { data, expires }.
 const cache = new Map();
 // Lifetime of a cache entry, in milliseconds.
 const TTL_MS = 60 * 1000; // 60s
 /**
 * Build the cache key for a route: the path followed by its parameters
 * serialized as a URL query string.
 *
 * @param {string} path   Route identifier, e.g. '/browse'.
 * @param {object} params Parameters to serialize with URLSearchParams.
 * @returns {string} Key of the form `path?name=value&...`.
 */
 function cacheKey(path, params) {
  const serialized = new URLSearchParams(params).toString();
  return `${path}?${serialized}`;
 }
 /**
 * Store a value in the in-memory cache with a TTL_MS lifetime.
 *
 * Cache keys are derived from request parameters, so without a bound the Map
 * would grow for as long as the process runs. When the cache is full, expired
 * entries are swept first; if it is still full, the oldest insertions are
 * evicted (a Map iterates in insertion order).
 *
 * @param {string} key  Cache key, typically produced by cacheKey().
 * @param {*}      data Value to cache.
 */
 function setCache(key, data) {
  const MAX_CACHE_ENTRIES = 500;
  const now = Date.now();

  if (!cache.has(key) && cache.size >= MAX_CACHE_ENTRIES) {
    // Cheap path: drop everything that has already expired.
    for (const [storedKey, entry] of cache) {
      if (entry.expires <= now) cache.delete(storedKey);
    }
    // Still full of live entries: evict the oldest ones to make room.
    while (cache.size >= MAX_CACHE_ENTRIES) {
      cache.delete(cache.keys().next().value);
    }
  }

  cache.set(key, { data, expires: now + TTL_MS });
 }
 function getCache(key) {
  const hit = cache.get(key);
  if (!hit) return null;
@ -35,56 +32,202 @@ function getCache(key) {
  return hit.data;
 }
 /* ------------------------------- HTTP GET -------------------------------- */
 async function httpGet(url) {
  const resp = await axios.get(url, {
    headers: {
-      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+      // UA “desktop” moderne pour minimiser les anti-bot simples
-      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+      'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
      'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.8'
    },
-    timeout: 15000
+    timeout: 15000,
    // Important: pas de redirects inter-domain hasardeux
    maxRedirects: 3,
    validateStatus: s => s >= 200 && s < 400
  });
  return resp.data;
 }
-async function scrapeRumbleVideo(videoId) {
+/* ------------------------- Utils: normalisation ID ------------------------ */
-  try {
+/**
-    const html = await httpGet(`https://rumble.com/${videoId}`);
+ * Rumble expose plusieurs formes:
-    const $ = cheerio.load(html);
+ * - Page canoniques:         https://rumble.com/v6siqxf-some-title.html
-    const title = $('h1.video-title, .video-title h1').first().text().trim() || $('meta[property="og:title"]').attr('content') || '';
+ * - Ancienne forme:          https://rumble.com/video/12345
-    const thumbnail = $('meta[property="og:image"]').attr('content') || '';
+ * - URL d’embed officielle:  https://rumble.com/embed/v6siqxf/
-    const uploaderName = $('.media-by--a, .channel-name').first().text().trim() || '';
+ * - ID brut attendu:         v6siqxf (toujours commence par 'v' + base62)
-    const viewsText = $('.rumbles-views, .video-views').first().text().trim();
+ *
-    const views = parseInt((viewsText || '').replace(/[^0-9]/g, '')) || 0;
+ * Cette fonction accepte: ID ou URL et renvoie { id: 'vXXXX', urlCanonique, embedUrl }
-    const durationText = $('meta[property="video:duration"]').attr('content');
+ */
-    const duration = durationText ? parseInt(durationText) : 0;
+function normalizeRumbleId(input, { preferEmbed = true } = {}) {
-    const uploadedDate = $('meta[property="article:published_time"]').attr('content') || '';
+  if (!input) return null;
-    const description = $('meta[property="og:description"]').attr('content') || '';
+
-    // Try to extract the official embed URL
+  let id = null;
-    let embedUrl = $('meta[property="og:video"], meta[name="twitter:player"]').attr('content') || '';
+  let urlCanonique = null;
-    if (!embedUrl) {
+  let embedUrl = null;
-      const iframeSrc = $('iframe[src*="/embed/"]').attr('src') || '';
+
-      embedUrl = iframeSrc || '';
+  // 1) Si on nous donne déjà un ID "vXXXX"
-    }
+  const clean = String(input).trim();
-    // Normalize protocol-less URLs
+  const mIdOnly = /^v[0-9A-Za-z]+$/.exec(clean);
-    if (embedUrl && embedUrl.startsWith('//')) embedUrl = 'https:' + embedUrl;
+  if (mIdOnly) {
-    // Detect canonical URL to extract stable ID
+    id = clean;
-    const canonicalUrl = $('link[rel="canonical"]').attr('href') || $('meta[property="og:url"]').attr('content') || '';
+    urlCanonique = `https://rumble.com/${id}`;
-    // Normalize/derive the stable Rumble ID (e.g., v464efu)
+    embedUrl = `https://rumble.com/embed/${id}/`;
-    let stableId = videoId;
+    return { id, urlCanonique, embedUrl };
    const mEmbed = /\/embed\/(v[0-9A-Za-z]+)/.exec(embedUrl || '');
    const mCanon = /\/(v[0-9A-Za-z]+)(?:[\-./]|$)/.exec(canonicalUrl || '');
    if (mEmbed && mEmbed[1]) stableId = mEmbed[1];
    else if (mCanon && mCanon[1]) stableId = mCanon[1];
    // If embedUrl is a page URL, convert to embed path as a fallback
    if (!/\/embed\//.test(embedUrl)) {
      embedUrl = `https://rumble.com/embed/${stableId}/?autoplay=2&muted=1`;
    }
    return { videoId: stableId, title: title || 'Untitled Video', thumbnail, uploaderName: uploaderName || 'Unknown Uploader', views, duration, uploadedDate, description, url: `https://rumble.com/${stableId}`, embedUrl, type: 'video' };
  } catch (e) {
    console.error('scrapeRumbleVideo error:', e.message);
    return { videoId, error: 'Scraping failed' };
  }
  // 2) Si on nous donne une URL
  try {
    const u = new URL(clean, 'https://rumble.com');
    // /embed/vXXXX/
    let m = /\/embed\/(v[0-9A-Za-z]+)/.exec(u.pathname);
    if (!m) m = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(u.pathname);
    if (!m) {
      // ancienne forme /video/123 → on ne sait pas convertir de manière fiable
      const mOld = /\/video\/([0-9A-Za-z]+)/.exec(u.pathname);
      if (mOld) {
        // On garde l’URL telle quelle et laissera le parseur de la page extraire le vrai vID.
        return { id: null, urlCanonique: u.href, embedUrl: null };
      }
      return null;
    }
    id = m[1];
    urlCanonique = `https://rumble.com/${id}`;
    embedUrl = `https://rumble.com/embed/${id}/`;
    return { id, urlCanonique, embedUrl };
  } catch {
    return null;
  }
 }
 /* ---------------------- Parsing robuste d’une PAGE vidéo ------------------ */
 /**
 * Resolve the Rumble video ID from a loaded video page.
 *
 * Sources are tried in this order:
 *   1. the inline player bootstrap `Rumble("play", {..., "video":"vXXXX", ...})`;
 *   2. `<meta property="og:video">` or `<meta name="twitter:player">`
 *      (usually an /embed/vXXXX/ URL);
 *   3. `<link rel="canonical">` or `<meta property="og:url">`
 *      (a /vXXXX-title path).
 *
 * NOTE(review): the original author reports, from public observation, that
 * the "video":"vXXXX" value is exactly the ID the official embed expects;
 * this cannot be verified from this file.
 *
 * @param {Function} $ Cheerio-style root: callable with a selector and
 *   exposing `html()`.
 * @returns {{id: string, embedUrl: string, urlCanonique: string} | null}
 *   The identity, or null when no source yields an ID.
 */
 function extractVideoIdentity($) {
  const identityFor = (id) => ({
    id,
    embedUrl: `https://rumble.com/embed/${id}/`,
    urlCanonique: `https://rumble.com/${id}`
  });

  // 1) Inline script: nothing is executed, the serialized HTML is only scanned.
  const html = $.html() || '';
  const playCall = /Rumble\(\s*["']play["']\s*,\s*{[^}]*["']video["']\s*:\s*["'](v[0-9A-Za-z]+)["']/s.exec(html);
  if (playCall && playCall[1]) return identityFor(playCall[1]);

  // 2) og:video / twitter:player -> .../embed/vXXXX/...
  let embed = $('meta[property="og:video"]').attr('content')
          || $('meta[name="twitter:player"]').attr('content');
  if (embed) {
    if (embed.startsWith('//')) embed = 'https:' + embed;
    const embedMatch = /\/embed\/(v[0-9A-Za-z]+)/.exec(embed);
    if (embedMatch) return identityFor(embedMatch[1]);
  }

  // 3) canonical / og:url -> .../vXXXX-...
  const canon = $('link[rel="canonical"]').attr('href')
          || $('meta[property="og:url"]').attr('content');
  if (canon) {
    // Match against the path only, so a host such as "videos.rumble.com"
    // cannot be mistaken for an ID.
    let pathname = '';
    try {
      pathname = new URL(canon, 'https://rumble.com').pathname;
    } catch {
      pathname = '';
    }
    // Skip the legacy "/video/<n>" prefix and the "/videos" listing path:
    // both start with "v" but are not video IDs.
    const canonMatch = /\/(?!videos?(?:\/|$))(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(pathname);
    if (canonMatch) return identityFor(canonMatch[1]);
  }

  return null;
 }
 /* -------------------------- Scraper d’une vidéo -------------------------- */
 /**
 * Fetch a Rumble video page and return its normalized metadata.
 *
 * @param {string} videoIdOrUrl A bare ID ("vXXXX") or a video URL/path.
 * @returns {Promise<object>} On success:
 *   { videoId, title, thumbnail, uploaderName, views, duration, uploadedDate,
 *     description, url, embedUrl, type: 'video' }.
 *   On failure an object carrying an `error` string. The whole body runs
 *   inside try/catch, so failures are reported through the return value.
 */
 async function scrapeRumbleVideo(videoIdOrUrl) {
  try {
    // Accepts a bare video ID or a full URL.
    const norm = normalizeRumbleId(videoIdOrUrl);
    const fetchUrl = norm?.urlCanonique || `https://rumble.com/${videoIdOrUrl}`;

    // normalizeRumbleId hands legacy URLs back verbatim, so the host may be
    // caller-controlled. Never fetch anything outside rumble.com.
    const { hostname } = new URL(fetchUrl);
    if (hostname !== 'rumble.com' && !hostname.endsWith('.rumble.com')) {
      return { error: 'Refusing to fetch a non-Rumble URL', input: videoIdOrUrl };
    }

    const html = await httpGet(fetchUrl);
    const $ = cheerio.load(html);

    // Reliable identity (id + embed + canonical). The page is the only source
    // of truth here, so a single extraction pass is all that can be done.
    const ident = extractVideoIdentity($);
    if (!ident?.id) {
      return { error: 'Unable to determine Rumble video ID', input: videoIdOrUrl };
    }

    // Metadata, each with a fallback chain.
    const title =
      $('h1.video-title, .video-title h1').first().text().trim()
      || $('meta[property="og:title"]').attr('content') || 'Untitled Video';

    let thumbnail = $('meta[property="og:image"]').attr('content') || '';
    // Protocol-relative URLs are made absolute.
    if (thumbnail && thumbnail.startsWith('//')) thumbnail = 'https:' + thumbnail;

    const uploaderName =
      $('.media-by--a, .channel-name, a[href*="/c/"]').first().text().trim() || '';

    const viewsText =
      $('.rumbles-views, .video-views, .media-view-count, [data-view-count]').first().text().trim() || '';
    const views = parseInt(viewsText.replace(/[^\d]/g, ''), 10) || 0;

    const duration = parseInt($('meta[property="video:duration"]').attr('content') || '', 10) || 0;

    const uploadedDate =
      $('meta[property="article:published_time"]').attr('content')
      || $('time[datetime]').attr('datetime') || '';

    const description =
      $('meta[property="og:description"]').attr('content')
      || $('meta[name="description"]').attr('content') || '';

    return {
      videoId: ident.id,
      title,
      thumbnail,
      uploaderName,
      views,
      duration,
      uploadedDate,
      description,
      url: ident.urlCanonique,
      // Always the official embed form built from the extracted ID.
      embedUrl: ident.embedUrl,
      type: 'video'
    };
  } catch (e) {
    const msg = (e && e.message) ? e.message : String(e);
    return { error: `Scraping failed: ${msg}` };
  }
 }
 /* ------------------ Scraper de liste (search / browse) ------------------ */
 /**
 * Convert a "mm:ss" or "hh:mm:ss" duration label into seconds.
 *
 * @param {string} text Duration text as shown on a video card.
 * @returns {number} Total seconds, or 0 when the text is empty or not in
 *   one of the two supported formats.
 */
 function parseDurationToSeconds(text) {
  if (!text) return 0;
  const m = String(text).trim().match(/^(\d{1,2}):(\d{2})(?::(\d{2}))?$/);
  if (!m) return 0;

  const first = parseInt(m[1], 10);
  const second = parseInt(m[2], 10);

  // Two fields: mm:ss. The leading field is minutes.
  if (m[3] === undefined) return first * 60 + second;

  // Three fields: hh:mm:ss. The leading field is hours and the last is seconds.
  return first * 3600 + second * 60 + parseInt(m[3], 10);
 }
 async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
@ -92,54 +235,113 @@ async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
    const url = q
      ? `https://rumble.com/search/video?q=${encodeURIComponent(q)}&page=${page}`
      : `https://rumble.com/videos?sort=${encodeURIComponent(sort)}&page=${page}`;
    const html = await httpGet(url);
    const $ = cheerio.load(html);
-    const items = [];
+
-    // Try to select video cards; Rumble uses different layouts, so search broadly
+    const found = [];
    // 1) Cartes "vidéos" standards (li/article/div)
    $('a[href^="/v"], a[href^="/video/"]').each((_, el) => {
-      const a = $(el);
+      const href = $(el).attr('href') || '';
-      const href = a.attr('href') || '';
+      // On préfère STRICTEMENT l’ID /vXXXX
-      // Expect href like /vabcdef or /video/abcdef
+      let m = /^\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(href);
-      const m = href.match(/\/v([A-Za-z0-9]+)/) || href.match(/\/video\/([A-Za-z0-9]+)/);
+      let id = m?.[1] || null;
-      if (!m) return;
+
-      const vid = `v${m[1]}`;
+      // Fallback minimaliste pour /video/123 → on ne convertit pas ici; on laissera /video/:id passer au détails qui normalise par parse de la page.
-      const title = a.attr('title') || a.text().trim();
+      const isLegacy = !id && /^\/video\//.test(href);
-      // Look around for thumbnail and meta
+
-      const parent = a.closest('li, article, div');
+      if (!id && !isLegacy) return;
-      const img = parent.find('img').first();
+
-      let thumb = img.attr('data-src') || img.attr('src') || '';
+      const card = $(el).closest('li, article, .video-listing-entry, .video-item, .video-card, div');
      const title = (($(el).attr('title') || '') + ' ' + $(el).text()).trim() || card.find('h3, h2, .video-item--title').first().text().trim();
      // Thumb robuste: data-src > src
      let thumb =
        card.find('img').first().attr('data-src')
        || card.find('img').first().attr('src')
        || '';
      if (thumb && thumb.startsWith('//')) thumb = 'https:' + thumb;
-      const durationText = parent.find('.video-item--duration, .video-duration, .duration').first().text().trim();
+
-      const viewsText = parent.find('.video-item--views, .rumbles-views, .views').first().text().trim();
+      const durationText =
-      const duration = (() => {
+        card.find('.video-item--duration, .video-duration, .duration, .video-item__duration').first().text().trim();
-        const m = durationText.match(/(\d+):(\d+)(?::(\d+))?/);
+      const viewsText =
-        if (!m) return 0;
+        card.find('.video-item--views, .rumbles-views, .views, .video-item__views').first().text().trim();
-        const h = parseInt(m[3] || '0', 10), mn = parseInt(m[1] || '0', 10), s = parseInt(m[2] || '0', 10);
+
-        return h * 3600 + mn * 60 + s;
+      const duration = parseDurationToSeconds(durationText);
-      })();
+      const views = parseInt((viewsText || '').replace(/[^\d]/g, ''), 10) || 0;
-      const views = parseInt((viewsText || '').replace(/[^0-9]/g, '')) || 0;
+
-      items.push({ videoId: vid, title, thumbnail: thumb, uploaderName: '', views, duration, uploadedDate: '', url: `https://rumble.com/${vid}`, type: 'video' });
+      // Important: on renvoie TOUJOURS une URL canonique cohérente
      let url = null;
      let videoId = null;
      if (id) {
        videoId = id;
        url = `https://rumble.com/${id}`;
      } else if (isLegacy) {
        // Laisse l’endpoint /video/:slug gérer la normalisation
        videoId = href.replace(/^\//, ''); // "video/123..."
        url = `https://rumble.com/${videoId}`;
      }
      // Filtrage doublons par videoId (id ou "video/123...")
      const key = videoId;
      found.push({
        videoId: key,
        title,
        thumbnail: thumb,
        uploaderName: '',
        views,
        duration,
        uploadedDate: '',
        url,
        type: 'video'
      });
    });
-    // De-duplicate by videoId and slice to limit
+
    // De-dupe
    const seen = new Set();
    const unique = [];
-    for (const it of items) { if (!seen.has(it.videoId)) { seen.add(it.videoId); unique.push(it); } }
+    for (const it of found) {
      if (!it.videoId) continue;
      if (seen.has(it.videoId)) continue;
      seen.add(it.videoId);
      unique.push(it);
    }
    // Limite + nextCursor (page-based)
    const list = unique.slice(0, limit);
    const nextCursor = list.length === limit ? String(Number(page) + 1) : null;
-    return { items: list, total: unique.length, page: Number(page), limit: Number(limit), nextCursor };
+
    return {
      items: list,
      total: unique.length,
      page: Number(page),
      limit: Number(limit),
      nextCursor
    };
  } catch (e) {
-    console.error('scrapeRumbleList error:', e.message);
+    return {
-    return { items: [], total: 0, page: Number(page), limit: Number(limit), nextCursor: null };
+      items: [],
      total: 0,
      page: Number(page),
      limit: Number(limit),
      nextCursor: null,
      error: (e && e.message) ? e.message : String(e)
    };
  }
 }
 /* --------------------------------- Routes -------------------------------- */
 router.get('/browse', async (req, res) => {
-  const page = parseInt(String(req.query.page || '1'), 10) || 1;
+  const page = Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
-  const limit = Math.min(50, parseInt(String(req.query.limit || '24'), 10) || 24);
+  const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));
  const sort = String(req.query.sort || 'viral');
  const key = cacheKey('/browse', { page, limit, sort });
  const cached = getCache(key);
  if (cached) return res.json(cached);
  const data = await scrapeRumbleList({ page, limit, sort });
  setCache(key, data);
  return res.json(data);
@ -148,37 +350,67 @@ router.get('/browse', async (req, res) => {
 router.get('/search', async (req, res) => {
  const q = String(req.query.q || '').trim();
  if (!q) return res.status(400).json({ error: 'Query parameter required' });
-  const limit = Math.min(50, parseInt(String(req.query.limit || '24'), 10) || 24);
+
  const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));
  const page = (() => {
    // Support offset-based cursor from frontend by translating offset->page
    if (req.query.offset != null) {
      const offset = parseInt(String(req.query.offset), 10) || 0;
      return Math.floor(offset / limit) + 1;
    }
-    return parseInt(String(req.query.page || '1'), 10) || 1;
+    return Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
  })();
  const key = cacheKey('/search', { q, page, limit });
  const cached = getCache(key);
  if (cached) return res.json(cached);
  const data = await scrapeRumbleList({ q, page, limit });
  setCache(key, data);
  return res.json(data);
 });
-router.get('/video/:videoId', async (req, res) => {
+// Endpoint details. Accepte :videoId pouvant être "vXXXX" OU "video/123..."
 router.get('/video/:videoId(*)', async (req, res) => {
  try {
-    const { videoId } = req.params;
+    const raw = String(req.params.videoId);
-    const key = cacheKey('/video', { videoId });
+    const key = cacheKey('/video', { videoId: raw });
    const cached = getCache(key);
    if (cached) return res.json(cached);
-    const videoData = await scrapeRumbleVideo(videoId);
+
-    if (videoData.error) return res.status(404).json({ error: 'Video not found or scraping failed' });
+    // Normalise au maximum avant scrape
-    setCache(key, videoData);
+    const norm = normalizeRumbleId(raw) || { urlCanonique: `https://rumble.com/${raw}` };
-    return res.json(videoData);
+    const data = await scrapeRumbleVideo(norm.id || norm.urlCanonique);
    if (data.error) return res.status(404).json(data);
    setCache(key, data);
    return res.json(data);
  } catch (error) {
    console.error('Rumble video error:', error);
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
 });
 /* ----------------- Option: “prélecteur” sans pub (non-embed) -------------- */
 /**
 * "Pre-player" metadata endpoint.
 *
 * Ads are NOT disabled on the Rumble side (there is no reliable official
 * parameter for that). Instead the client can show a pre-play card:
 *  - display the thumbnail and title;
 *  - on click, either (A) open the video on Rumble (cleanest UX), or
 *    (B) inject the official iframe (which triggers Rumble's own ad logic).
 * This route only returns the metadata that pre-play component needs.
 *
 * NOTE(review): '/video/:videoId(*)' is registered before this route; confirm
 * the wildcard does not capture '<id>/preplay' and shadow this handler.
 */
 router.get('/video/:videoId/preplay', async (req, res) => {
  const requested = String(req.params.videoId);
  const fallback = { urlCanonique: `https://rumble.com/${requested}` };
  const target = normalizeRumbleId(requested) || fallback;

  const scraped = await scrapeRumbleVideo(target.id || target.urlCanonique);
  if (scraped.error) {
    return res.status(404).json(scraped);
  }

  const { videoId, title, thumbnail, url, embedUrl } = scraped;
  res.json({
    videoId,
    title,
    thumbnail,
    rumbleUrl: url,   // target of the "open on the provider's site" button
    embedUrl          // deferred iframe injection if the user chooses to play here
  });
 });
 export default router;