/*
 * Rumble scraping router: the post-patch content of server/rumble.mjs, laid
 * out as regular source instead of collapsed diff hunks.
 *
 * NOTE(review): the first hunk starts at line 5 of the file, so the import
 * lines are not visible here. This code expects `express`, `axios`, `cheerio`
 * and `rateLimit` (express-rate-limit) to be imported above it; confirm.
 */

const router = express.Router();

/** Every outgoing request is pinned to this origin. */
const RUMBLE_ORIGIN = 'https://rumble.com';

/* ----------------------------- Rate limiting ----------------------------- */
// At most 20 requests per 60-second window on every route of this router.
const rumbleLimiter = rateLimit({
  windowMs: 60 * 1000,
  max: 20,
  standardHeaders: true,
  legacyHeaders: false,
  message: { error: 'Too many requests to Rumble. Please try again later.' },
});
router.use(rumbleLimiter);

/* --------------------------------- Cache -------------------------------- */
const cache = new Map();
const TTL_MS = 60 * 1000; // 60s
// Keys are built from user-supplied query values, so the map needs a ceiling.
const MAX_CACHE_ENTRIES = 500;

/**
 * Builds a cache key from a route path and its parameters.
 * @param {string} path - Route path, e.g. '/search'.
 * @param {Object} params - Parameters serialised with URLSearchParams.
 * @returns {string} `${path}?${query}`.
 */
function cacheKey(path, params) {
  return `${path}?${new URLSearchParams(params).toString()}`;
}

/**
 * Stores `data` under `key` for TTL_MS milliseconds.
 * When the map is full, expired entries are dropped first; if it is still
 * full, the oldest insertion is evicted.
 * @param {string} key
 * @param {*} data
 */
function setCache(key, data) {
  if (cache.size >= MAX_CACHE_ENTRIES) {
    const now = Date.now();
    for (const [storedKey, entry] of cache) {
      if (entry.expires <= now) cache.delete(storedKey);
    }
    if (cache.size >= MAX_CACHE_ENTRIES) {
      // A Map iterates in insertion order, so the first key is the oldest.
      cache.delete(cache.keys().next().value);
    }
  }
  cache.set(key, { data, expires: Date.now() + TTL_MS });
}

/**
 * Returns the cached value for `key`, or null when absent or expired.
 * @param {string} key
 * @returns {*} Cached data or null.
 */
function getCache(key) {
  const hit = cache.get(key);
  if (!hit) return null;
  // NOTE(review): this expiry branch sits between two hunks and is not shown
  // in the patch; it is inferred from the `expires` field written by
  // setCache. Confirm against the file.
  if (Date.now() > hit.expires) {
    cache.delete(key);
    return null;
  }
  return hit.data;
}

/* ------------------------------- HTTP GET -------------------------------- */
/**
 * Fetches `url` with browser-like headers and returns the response body.
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<*>} `resp.data` as provided by axios.
 */
async function httpGet(url) {
  const resp = await axios.get(url, {
    headers: {
      // Modern desktop user agent, to get past simple anti-bot checks.
      'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
      'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.8',
    },
    timeout: 15000,
    // Keep redirect chains short.
    maxRedirects: 3,
    validateStatus: (status) => status >= 200 && status < 400,
  });
  return resp.data;
}

/* -------------------------- Utils: ID normalisation ----------------------- */
const VIDEO_ID_RE = /^v[0-9A-Za-z]+$/;
const EMBED_PATH_RE = /\/embed\/(v[0-9A-Za-z]+)/;
const PAGE_PATH_RE = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/;
const LEGACY_PATH_RE = /^\/video\/[0-9A-Za-z]+/;
// Path segments that fit the "v + alphanumerics" shape but are site sections.
const RESERVED_SEGMENTS = new Set(['video', 'videos']);

/**
 * Tells whether `value` looks like a Rumble video ID ("v" + alphanumerics).
 * The words "video" and "videos" fit the pattern but are rejected: without
 * this check, "/video/123" and "/videos" would be read as IDs.
 * @param {*} value
 * @returns {boolean}
 */
function isVideoId(value) {
  return (
    typeof value === 'string'
    && VIDEO_ID_RE.test(value)
    && !RESERVED_SEGMENTS.has(value)
  );
}

/**
 * Builds the identity triple for a known video ID.
 * @param {string} id - Video ID such as "v6siqxf".
 * @returns {{id: string, urlCanonique: string, embedUrl: string}}
 */
function buildIdentity(id) {
  return {
    id,
    urlCanonique: `${RUMBLE_ORIGIN}/${id}`,
    embedUrl: `${RUMBLE_ORIGIN}/embed/${id}/`,
  };
}

/**
 * Applies `pattern` to the path of `rawUrl` and returns the captured ID.
 * Only the path is searched, so host names and query strings cannot produce
 * a false match.
 * @param {string|undefined} rawUrl - Absolute, protocol-relative or relative URL.
 * @param {RegExp} pattern - Non-global regex whose first group is the ID.
 * @returns {string|null} The ID, or null when nothing usable is found.
 */
function matchIdInPath(rawUrl, pattern) {
  if (!rawUrl) return null;
  let pathname;
  try {
    ({ pathname } = new URL(String(rawUrl).trim(), RUMBLE_ORIGIN));
  } catch {
    return null;
  }
  const found = pattern.exec(pathname);
  return found && isVideoId(found[1]) ? found[1] : null;
}

/**
 * Rebuilds `input` as a URL on the Rumble origin, keeping path and query.
 * Whatever host the caller supplied is discarded.
 * @param {*} input - Path fragment or URL.
 * @returns {string|null} URL on RUMBLE_ORIGIN, or null when unusable.
 */
function toRumbleUrl(input) {
  if (input == null) return null;
  const text = String(input).trim();
  if (!text) return null;
  try {
    const parsed = new URL(text, RUMBLE_ORIGIN);
    const isHttp = parsed.protocol === 'https:' || parsed.protocol === 'http:';
    if (!isHttp || !parsed.pathname.startsWith('/')) return null;
    return `${RUMBLE_ORIGIN}${parsed.pathname}${parsed.search}`;
  } catch {
    return null;
  }
}

/**
 * Normalises an ID or URL to a Rumble identity.
 *
 * Accepted forms:
 *   - raw ID:         v6siqxf
 *   - canonical page: https://rumble.com/v6siqxf-some-title.html
 *   - embed URL:      https://rumble.com/embed/v6siqxf/
 *   - legacy page:    https://rumble.com/video/12345 (no ID can be derived;
 *     the page has to be fetched and parsed to find it)
 *
 * @param {*} input - ID or URL.
 * @param {Object} [options]
 * @param {boolean} [options.preferEmbed=true] - Accepted for interface
 *   compatibility; it currently has no effect.
 * @returns {{id: (string|null), urlCanonique: string, embedUrl: (string|null)}|null}
 *   Identity, or null when the input is not recognised. Legacy pages return
 *   `id: null` and `embedUrl: null`.
 */
function normalizeRumbleId(input, { preferEmbed = true } = {}) {
  if (!input) return null;

  // 1) Already a bare "vXXXX" ID.
  const clean = String(input).trim();
  if (isVideoId(clean)) return buildIdentity(clean);

  // 2) Otherwise treat it as a URL (relative inputs resolve against Rumble).
  let parsed;
  try {
    parsed = new URL(clean, RUMBLE_ORIGIN);
  } catch {
    return null;
  }
  const { pathname, search } = parsed;

  const embedMatch = EMBED_PATH_RE.exec(pathname);
  if (embedMatch && isVideoId(embedMatch[1])) return buildIdentity(embedMatch[1]);

  // Legacy pages are tested before the generic pattern because "video" fits
  // that pattern. The URL is rebuilt on the Rumble origin so a foreign host
  // in the input is never fetched.
  if (LEGACY_PATH_RE.test(pathname)) {
    return {
      id: null,
      urlCanonique: `${RUMBLE_ORIGIN}${pathname}${search}`,
      embedUrl: null,
    };
  }

  const pageMatch = PAGE_PATH_RE.exec(pathname);
  if (pageMatch && isVideoId(pageMatch[1])) return buildIdentity(pageMatch[1]);

  return null;
}

/* ------------------------ Parsing of a video PAGE ------------------------ */
/**
 * Finds the video ID in a loaded video page.
 *
 * Lookup order:
 *   1. the inline script call Rumble("play", {..., "video":"vXXXX", ...});
 *   2. the og:video meta tag, else twitter:player (path /embed/vXXXX/);
 *   3. the canonical link, else the og:url meta tag (path /vXXXX-...).
 *
 * Nothing from the page is executed; step 1 is a regex over the HTML.
 *
 * @param {Function} $ - Cheerio root for the page.
 * @returns {{id: string, urlCanonique: string, embedUrl: string}|null}
 */
function extractVideoIdentity($) {
  const html = $.html() || '';
  const playMatch =
    /Rumble\(\s*["']play["']\s*,\s*{[^}]*["']video["']\s*:\s*["'](v[0-9A-Za-z]+)["']/s.exec(html);
  if (playMatch && isVideoId(playMatch[1])) return buildIdentity(playMatch[1]);

  const embedHref =
    $('meta[property="og:video"]').attr('content')
    || $('meta[name="twitter:player"]').attr('content');
  const embedId = matchIdInPath(embedHref, EMBED_PATH_RE);
  if (embedId) return buildIdentity(embedId);

  const canonicalHref =
    $('link[rel="canonical"]').attr('href')
    || $('meta[property="og:url"]').attr('content');
  const pageId = matchIdInPath(canonicalHref, PAGE_PATH_RE);
  if (pageId) return buildIdentity(pageId);

  return null;
}

/* ---------------------------- Video scraper ------------------------------ */
/**
 * Scrapes one video page and returns its metadata.
 *
 * @param {string} videoIdOrUrl - "vXXXX" ID, page URL, or path fragment.
 * @returns {Promise<Object>} On success: `{ videoId, title, thumbnail,
 *   uploaderName, views, duration, uploadedDate, description, url, embedUrl,
 *   type: 'video' }`. On failure: `{ error, ... }`. This function does not
 *   reject; failures are reported through the `error` field.
 */
async function scrapeRumbleVideo(videoIdOrUrl) {
  try {
    const norm = normalizeRumbleId(videoIdOrUrl);
    const fetchUrl = norm?.urlCanonique ?? toRumbleUrl(videoIdOrUrl);
    if (!fetchUrl) {
      return { error: 'Unable to determine Rumble video ID', input: videoIdOrUrl };
    }

    const html = await httpGet(fetchUrl);
    const $ = cheerio.load(html);

    // The page is the authority for the ID, including for legacy /video/ URLs.
    const ident = extractVideoIdentity($);
    if (!ident?.id) {
      return { error: 'Unable to determine Rumble video ID', input: videoIdOrUrl };
    }

    const title =
      $('h1.video-title, .video-title h1').first().text().trim()
      || $('meta[property="og:title"]').attr('content')
      || 'Untitled Video';

    let thumbnail = $('meta[property="og:image"]').attr('content') || '';
    if (thumbnail.startsWith('//')) thumbnail = `https:${thumbnail}`;

    const uploaderName =
      $('.media-by--a, .channel-name, a[href*="/c/"]').first().text().trim() || '';

    // Digits only: abbreviated counts such as "1.2K" are not expanded.
    const viewsText =
      $('.rumbles-views, .video-views, .media-view-count, [data-view-count]')
        .first().text().trim() || '';
    const views = parseInt(viewsText.replace(/[^\d]/g, ''), 10) || 0;

    const duration =
      parseInt($('meta[property="video:duration"]').attr('content') || '', 10) || 0;

    const uploadedDate =
      $('meta[property="article:published_time"]').attr('content')
      || $('time[datetime]').attr('datetime')
      || '';

    const description =
      $('meta[property="og:description"]').attr('content')
      || $('meta[name="description"]').attr('content')
      || '';

    return {
      videoId: ident.id,
      title,
      thumbnail,
      uploaderName,
      views,
      duration,
      uploadedDate,
      description,
      url: ident.urlCanonique,
      embedUrl: ident.embedUrl, // always the /embed/<id>/ form
      type: 'video',
    };
  } catch (e) {
    const msg = (e && e.message) ? e.message : String(e);
    return { error: `Scraping failed: ${msg}` };
  }
}

/* -------------------- List scraper (search / browse) --------------------- */
/**
 * Converts "mm:ss" or "hh:mm:ss" to seconds.
 * @param {string} text - Duration label.
 * @returns {number} Seconds, or 0 when the text is empty or not in that format.
 */
function parseDurationToSeconds(text) {
  if (!text) return 0;
  const parts = String(text).trim().match(/^(\d{1,2}):(\d{2})(?::(\d{2}))?$/);
  if (!parts) return 0;
  const [, first, second, third] = parts;
  // With two groups the first is minutes; with three the first is hours.
  if (third === undefined) return Number(first) * 60 + Number(second);
  return Number(first) * 3600 + Number(second) * 60 + Number(third);
}

/**
 * Scrapes a search-result page (when `q` is set) or a browse page.
 *
 * @param {Object} options
 * @param {string} [options.q] - Search query; when empty, /videos is browsed.
 * @param {number} [options.page=1]
 * @param {number} [options.limit=24] - Maximum number of items returned.
 * @param {string} [options.sort='viral'] - Sort key for browsing.
 * @returns {Promise<Object>} `{ items, total, page, limit, nextCursor }`, plus
 *   `error` when the scrape failed (then `items` is empty). Does not reject.
 */
async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
  try {
    const url = q
      ? `https://rumble.com/search/video?q=${encodeURIComponent(q)}&page=${page}`
      : `https://rumble.com/videos?sort=${encodeURIComponent(sort)}&page=${page}`;

    const html = await httpGet(url);
    const $ = cheerio.load(html);

    const found = [];
    $('a[href^="/v"], a[href^="/video/"]').each((_, el) => {
      const anchor = $(el);
      const href = anchor.attr('href') || '';

      // Real IDs only: "/videos" navigation links and "/video/..." legacy
      // links fit the raw pattern and are filtered by isVideoId.
      const idMatch = /^\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(href);
      const id = idMatch && isVideoId(idMatch[1]) ? idMatch[1] : null;

      // Legacy /video/123 links are passed through untouched; the detail
      // endpoint resolves the real ID by parsing the page.
      const isLegacy = !id && /^\/video\//.test(href);
      if (!id && !isLegacy) return;

      const card = anchor.closest(
        'li, article, .video-listing-entry, .video-item, .video-card, div',
      );

      // First non-empty source wins; concatenating them would repeat the title.
      const title =
        (anchor.attr('title') || '').trim()
        || anchor.text().trim()
        || card.find('h3, h2, .video-item--title').first().text().trim();

      // Lazy-loaded images keep the real URL in data-src.
      const image = card.find('img').first();
      let thumb = image.attr('data-src') || image.attr('src') || '';
      if (thumb.startsWith('//')) thumb = `https:${thumb}`;

      const durationText = card
        .find('.video-item--duration, .video-duration, .duration, .video-item__duration')
        .first().text().trim();
      const viewsText = card
        .find('.video-item--views, .rumbles-views, .views, .video-item__views')
        .first().text().trim();

      // For legacy links the "ID" is the path itself ("video/123...").
      const videoId = id || href.replace(/^\//, '');

      found.push({
        videoId,
        title,
        thumbnail: thumb,
        uploaderName: '',
        views: parseInt((viewsText || '').replace(/[^\d]/g, ''), 10) || 0,
        duration: parseDurationToSeconds(durationText),
        uploadedDate: '',
        url: `https://rumble.com/${videoId}`,
        type: 'video',
      });
    });

    // De-duplicate on videoId, keeping the first occurrence.
    const seen = new Set();
    const unique = [];
    for (const item of found) {
      if (!item.videoId || seen.has(item.videoId)) continue;
      seen.add(item.videoId);
      unique.push(item);
    }

    // Page-based cursor: a full page suggests there may be another one.
    const list = unique.slice(0, limit);
    const nextCursor = list.length === limit ? String(Number(page) + 1) : null;

    return {
      items: list,
      total: unique.length,
      page: Number(page),
      limit: Number(limit),
      nextCursor,
    };
  } catch (e) {
    return {
      items: [],
      total: 0,
      page: Number(page),
      limit: Number(limit),
      nextCursor: null,
      error: (e && e.message) ? e.message : String(e),
    };
  }
}

/* --------------------------------- Routes -------------------------------- */
/**
 * Parses a query value as an integer clamped to [min, max].
 * @param {*} value - Raw query value.
 * @param {number} fallback - Used when the value is missing, 0, or not numeric.
 * @param {number} min
 * @param {number} max
 * @returns {number}
 */
function clampedInt(value, fallback, min, max) {
  const parsed = parseInt(String(value ?? ''), 10) || fallback;
  return Math.min(max, Math.max(min, parsed));
}

/**
 * Loads video metadata through the cache.
 * Only successful scrapes are cached, so a transient failure is retried on
 * the next request instead of being replayed for the whole TTL.
 * @param {string} raw - Route parameter: "vXXXX" or "video/123...".
 * @returns {Promise<Object>} Result of scrapeRumbleVideo.
 */
async function loadVideo(raw) {
  const key = cacheKey('/video', { videoId: raw });
  const cached = getCache(key);
  if (cached) return cached;

  const norm = normalizeRumbleId(raw);
  const data = await scrapeRumbleVideo(norm?.id ?? norm?.urlCanonique ?? raw);
  if (!data.error) setCache(key, data);
  return data;
}

router.get('/browse', async (req, res) => {
  const page = clampedInt(req.query.page, 1, 1, Number.MAX_SAFE_INTEGER);
  const limit = clampedInt(req.query.limit, 24, 1, 50);
  const sort = String(req.query.sort || 'viral');

  const key = cacheKey('/browse', { page, limit, sort });
  const cached = getCache(key);
  if (cached) return res.json(cached);

  const data = await scrapeRumbleList({ page, limit, sort });
  if (!data.error) setCache(key, data);
  return res.json(data);
});

router.get('/search', async (req, res) => {
  const q = String(req.query.q || '').trim();
  if (!q) return res.status(400).json({ error: 'Query parameter required' });

  const limit = clampedInt(req.query.limit, 24, 1, 50);
  let page;
  if (req.query.offset != null) {
    // Offset-based cursors are translated to a page number; negative offsets
    // are treated as 0 so the page is never below 1.
    const offset = Math.max(0, parseInt(String(req.query.offset), 10) || 0);
    page = Math.floor(offset / limit) + 1;
  } else {
    page = clampedInt(req.query.page, 1, 1, Number.MAX_SAFE_INTEGER);
  }

  const key = cacheKey('/search', { q, page, limit });
  const cached = getCache(key);
  if (cached) return res.json(cached);

  const data = await scrapeRumbleList({ q, page, limit });
  if (!data.error) setCache(key, data);
  return res.json(data);
});

/*
 * "Pre-player" metadata (no embed).
 *
 * Ads on the Rumble side are not disabled. This route returns only what a
 * click-to-play placeholder needs: thumbnail and title, a link to open the
 * video on Rumble, and the embed URL to inject once the user chooses to play
 * in place.
 *
 * It is registered before the catch-all detail route because that pattern
 * also matches ".../preplay" and would otherwise take the request.
 */
router.get('/video/:videoId/preplay', async (req, res) => {
  try {
    const data = await loadVideo(String(req.params.videoId));
    if (data.error) return res.status(404).json(data);

    return res.json({
      videoId: data.videoId,
      title: data.title,
      thumbnail: data.thumbnail,
      rumbleUrl: data.url, // "open on the provider's site" button
      embedUrl: data.embedUrl, // deferred iframe injection
    });
  } catch (error) {
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
});

// Detail endpoint. :videoId may be "vXXXX" or a legacy "video/123..." path.
router.get('/video/:videoId(*)', async (req, res) => {
  try {
    const data = await loadVideo(String(req.params.videoId));
    if (data.error) return res.status(404).json(data);
    return res.json(data);
  } catch (error) {
    return res.status(500).json({ error: 'Failed to scrape video' });
  }
});

export default router;