diff --git a/db/newtube.db b/db/newtube.db
index f841335..0254ffd 100644
Binary files a/db/newtube.db and b/db/newtube.db differ
diff --git a/server/rumble.mjs b/server/rumble.mjs
index 5a419fd..d283b9b 100644
--- a/server/rumble.mjs
+++ b/server/rumble.mjs
@@ -5,29 +5,26 @@ import rateLimit from 'express-rate-limit';
const router = express.Router();
-// Rate limiter for Rumble scraping to prevent being blocked
+/* ----------------------------- Rate limiting ----------------------------- */
const rumbleLimiter = rateLimit({
- windowMs: 60 * 1000, // 1 min
+ windowMs: 60 * 1000,
max: 20,
standardHeaders: true,
legacyHeaders: false,
- message: { error: 'Too many requests to Rumble API. Please try again later.' }
+ message: { error: 'Too many requests to Rumble. Please try again later.' }
});
-
router.use(rumbleLimiter);
-// Simple in-memory cache with TTL
+/* --------------------------------- Cache -------------------------------- */
const cache = new Map();
const TTL_MS = 60 * 1000; // 60s
function cacheKey(path, params) {
return `${path}?${new URLSearchParams(params).toString()}`;
}
-
function setCache(key, data) {
cache.set(key, { data, expires: Date.now() + TTL_MS });
}
-
function getCache(key) {
const hit = cache.get(key);
if (!hit) return null;
@@ -35,56 +32,202 @@ function getCache(key) {
return hit.data;
}
+/* ------------------------------- HTTP GET -------------------------------- */
async function httpGet(url) {
const resp = await axios.get(url, {
headers: {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+ // Modern "desktop" UA, intended to get past simple anti-bot checks
+ 'User-Agent':
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
+ 'Accept':
+ 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.8'
},
- timeout: 15000
+ timeout: 15000,
+ // Cap redirect hops. NOTE(review): this limits the count only; it does not restrict redirects to the same domain.
+ maxRedirects: 3,
+ validateStatus: s => s >= 200 && s < 400
});
return resp.data;
}
-async function scrapeRumbleVideo(videoId) {
- try {
- const html = await httpGet(`https://rumble.com/${videoId}`);
- const $ = cheerio.load(html);
- const title = $('h1.video-title, .video-title h1').first().text().trim() || $('meta[property="og:title"]').attr('content') || '';
- const thumbnail = $('meta[property="og:image"]').attr('content') || '';
- const uploaderName = $('.media-by--a, .channel-name').first().text().trim() || '';
- const viewsText = $('.rumbles-views, .video-views').first().text().trim();
- const views = parseInt((viewsText || '').replace(/[^0-9]/g, '')) || 0;
- const durationText = $('meta[property="video:duration"]').attr('content');
- const duration = durationText ? parseInt(durationText) : 0;
- const uploadedDate = $('meta[property="article:published_time"]').attr('content') || '';
- const description = $('meta[property="og:description"]').attr('content') || '';
- // Try to extract the official embed URL
- let embedUrl = $('meta[property="og:video"], meta[name="twitter:player"]').attr('content') || '';
- if (!embedUrl) {
- const iframeSrc = $('iframe[src*="/embed/"]').attr('src') || '';
- embedUrl = iframeSrc || '';
- }
- // Normalize protocol-less URLs
- if (embedUrl && embedUrl.startsWith('//')) embedUrl = 'https:' + embedUrl;
- // Detect canonical URL to extract stable ID
- const canonicalUrl = $('link[rel="canonical"]').attr('href') || $('meta[property="og:url"]').attr('content') || '';
- // Normalize/derive the stable Rumble ID (e.g., v464efu)
- let stableId = videoId;
- const mEmbed = /\/embed\/(v[0-9A-Za-z]+)/.exec(embedUrl || '');
- const mCanon = /\/(v[0-9A-Za-z]+)(?:[\-./]|$)/.exec(canonicalUrl || '');
- if (mEmbed && mEmbed[1]) stableId = mEmbed[1];
- else if (mCanon && mCanon[1]) stableId = mCanon[1];
- // If embedUrl is a page URL, convert to embed path as a fallback
- if (!/\/embed\//.test(embedUrl)) {
- embedUrl = `https://rumble.com/embed/${stableId}/?autoplay=2&muted=1`;
- }
- return { videoId: stableId, title: title || 'Untitled Video', thumbnail, uploaderName: uploaderName || 'Unknown Uploader', views, duration, uploadedDate, description, url: `https://rumble.com/${stableId}`, embedUrl, type: 'video' };
- } catch (e) {
- console.error('scrapeRumbleVideo error:', e.message);
- return { videoId, error: 'Scraping failed' };
+/* ------------------------- Utils: ID normalization ------------------------ */
+/**
+ * Rumble exposes several URL/ID shapes:
+ * - Canonical page: https://rumble.com/v6siqxf-some-title.html
+ * - Legacy form: https://rumble.com/video/12345
+ * - Official embed URL: https://rumble.com/embed/v6siqxf/
+ * - Bare ID: v6siqxf (always 'v' followed by base62 characters)
+ * The `preferEmbed` option is accepted for compatibility but is currently unused.
+ * Accepts an ID or a URL; returns { id, urlCanonique, embedUrl } (always on rumble.com) or null when nothing usable is found.
+ */
+function normalizeRumbleId(input, { preferEmbed = true } = {}) {
+ if (!input) return null;
+
+ let id = null;
+ let urlCanonique = null;
+ let embedUrl = null;
+
+ // 1) Input is already a bare "vXXXX" ID.
+ const clean = String(input).trim();
+ const mIdOnly = /^v[0-9A-Za-z]+$/.exec(clean);
+ if (mIdOnly) {
+ id = clean;
+ urlCanonique = `https://rumble.com/${id}`;
+ embedUrl = `https://rumble.com/embed/${id}/`;
+ return { id, urlCanonique, embedUrl };
}
+
+ // 2) Input is a URL (relative inputs are resolved against rumble.com).
+ try {
+ const u = new URL(clean, 'https://rumble.com');
+ // /embed/vXXXX/
+ let m = /\/embed\/(v[0-9A-Za-z]+)/.exec(u.pathname);
+ if (!m) m = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(u.pathname);
+ if (!m) {
+ // Legacy /video/123 form: it cannot be converted to a vXXXX ID reliably.
+ const mOld = /\/video\/([0-9A-Za-z]+)/.exec(u.pathname);
+ if (mOld) {
+ // Rebuild on rumble.com instead of echoing u.href: a caller-supplied absolute URL must never choose the host we fetch (SSRF).
+ return { id: null, urlCanonique: `https://rumble.com${u.pathname}${u.search}`, embedUrl: null };
+ }
+ return null;
+ }
+ id = m[1];
+ urlCanonique = `https://rumble.com/${id}`;
+ embedUrl = `https://rumble.com/embed/${id}/`;
+ return { id, urlCanonique, embedUrl };
+ } catch {
+ return null;
+ }
+}
+
+/* ---------------------- Robust parsing of a video PAGE -------------------- */
+/**
+ * Preferred source for the real ID is the inline JS:
+ * Rumble("play", {..., "video":"vXXXX", ...})
+ * Fallbacks, in order: the og:video / twitter:player meta (often /embed/vXXXX/),
+ * then the canonical link or the og:url meta (containing /vXXXX-...).
+ *
+ * NB: per the original author's observation of public pages, the "video":"vXXXX"
+ * value is exactly the ID the official embed expects. NOTE(review): not verifiable from this file.
+ */
+function extractVideoIdentity($) {
+ // 1) Script "Rumble('play', {... "video":"vXXXX" ...})"
+ // Nothing is executed; this is a plain regex over the serialized HTML.
+ const html = $.html() || '';
+ let m = /Rumble\(\s*["']play["']\s*,\s*{[^}]*["']video["']\s*:\s*["'](v[0-9A-Za-z]+)["']/s.exec(html);
+ if (m && m[1]) {
+ const id = m[1];
+ return {
+ id,
+ embedUrl: `https://rumble.com/embed/${id}/`,
+ urlCanonique: `https://rumble.com/${id}`
+ };
+ }
+
+ // 2) og:video → .../embed/vXXXX/...
+ let embed = $('meta[property="og:video"]').attr('content')
+ || $('meta[name="twitter:player"]').attr('content');
+ if (embed) {
+ if (embed.startsWith('//')) embed = 'https:' + embed;
+ const mm = /\/embed\/(v[0-9A-Za-z]+)/.exec(embed);
+ if (mm) {
+ const id = mm[1];
+ return { id, embedUrl: `https://rumble.com/embed/${id}/`, urlCanonique: `https://rumble.com/${id}` };
+ }
+ }
+
+ // 3) Canonical / og:url → .../vXXXX-...
+ let canon = $('link[rel="canonical"]').attr('href')
+ || $('meta[property="og:url"]').attr('content');
+ if (canon) {
+ const mm = /\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(canon);
+ if (mm) {
+ const id = mm[1];
+ return { id, embedUrl: `https://rumble.com/embed/${id}/`, urlCanonique: `https://rumble.com/${id}` };
+ }
+ }
+
+ return null;
+}
+
+/* ----------------------------- Video scraper ----------------------------- */
+async function scrapeRumbleVideo(videoIdOrUrl) {
+ try {
+ // Accepts a bare video ID or a full URL.
+ let norm = normalizeRumbleId(videoIdOrUrl);
+ const fetchUrl = norm?.urlCanonique || `https://rumble.com/${videoIdOrUrl}`;
+ const html = await httpGet(fetchUrl);
+ const $ = cheerio.load(html);
+
+ // Identity taken from the fetched page (id + embed URL + canonical URL)
+ let ident = extractVideoIdentity($);
+ if (!ident) {
+ // Last resort for legacy /video/123 inputs. NOTE(review): this re-runs the same extraction on the same document, so it cannot change the result.
+ if (!norm?.id && norm?.urlCanonique) {
+ ident = extractVideoIdentity($);
+ }
+ }
+ if (!ident?.id) {
+ return { error: 'Unable to determine Rumble video ID', input: videoIdOrUrl };
+ }
+
+ // Metadata, each field with fallback selectors and a default value
+ const title =
+ $('h1.video-title, .video-title h1').first().text().trim()
+ || $('meta[property="og:title"]').attr('content') || 'Untitled Video';
+
+ let thumbnail = $('meta[property="og:image"]').attr('content') || '';
+ if (thumbnail && thumbnail.startsWith('//')) thumbnail = 'https:' + thumbnail;
+
+ const uploaderName =
+ $('.media-by--a, .channel-name, a[href*="/c/"]').first().text().trim() || '';
+
+ const viewsText =
+ $('.rumbles-views, .video-views, .media-view-count, [data-view-count]').first().text().trim() || '';
+ const views = parseInt(viewsText.replace(/[^\d]/g, ''), 10) || 0;
+
+ const duration = parseInt($('meta[property="video:duration"]').attr('content') || '', 10) || 0;
+
+ const uploadedDate =
+ $('meta[property="article:published_time"]').attr('content')
+ || $('time[datetime]').attr('datetime') || '';
+
+ const description =
+ $('meta[property="og:description"]').attr('content')
+ || $('meta[name="description"]').attr('content') || '';
+
+ // Final embedUrl: always the URL produced by extractVideoIdentity, never one copied verbatim from the page
+ const embedUrl = ident.embedUrl;
+
+ return {
+ videoId: ident.id,
+ title,
+ thumbnail,
+ uploaderName,
+ views,
+ duration,
+ uploadedDate,
+ description,
+ url: ident.urlCanonique,
+ embedUrl,
+ type: 'video'
+ };
+ } catch (e) {
+ const msg = (e && e.message) ? e.message : String(e);
+ return { error: `Scraping failed: ${msg}` };
+ }
+}
+
+/* ------------------- List scraper (search / browse) -------------------- */
+function parseDurationToSeconds(text) {
+ if (!text) return 0;
+ // Converts "mm:ss" or "hh:mm:ss" to seconds (0 when empty or unparseable); group 3 exists only for hh:mm:ss.
+ const m = text.trim().match(/^(\d{1,2}):(\d{2})(?::(\d{2}))?$/);
+ if (!m) return 0;
+ const [a, b, c] = [m[1], m[2], m[3] || '0'].map(v => parseInt(v, 10));
+ return m[3] === undefined ? a * 60 + b : a * 3600 + b * 60 + c;
}
async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
@@ -92,54 +235,113 @@ async function scrapeRumbleList({ q, page = 1, limit = 24, sort = 'viral' }) {
const url = q
? `https://rumble.com/search/video?q=${encodeURIComponent(q)}&page=${page}`
: `https://rumble.com/videos?sort=${encodeURIComponent(sort)}&page=${page}`;
+
const html = await httpGet(url);
const $ = cheerio.load(html);
- const items = [];
- // Try to select video cards; Rumble uses different layouts, so search broadly
+
+ const found = [];
+ // 1) Standard video cards (li/article/div): every link to /vXXXX or /video/...
$('a[href^="/v"], a[href^="/video/"]').each((_, el) => {
- const a = $(el);
- const href = a.attr('href') || '';
- // Expect href like /vabcdef or /video/abcdef
- const m = href.match(/\/v([A-Za-z0-9]+)/) || href.match(/\/video\/([A-Za-z0-9]+)/);
- if (!m) return;
- const vid = `v${m[1]}`;
- const title = a.attr('title') || a.text().trim();
- // Look around for thumbnail and meta
- const parent = a.closest('li, article, div');
- const img = parent.find('img').first();
- let thumb = img.attr('data-src') || img.attr('src') || '';
+ const href = $(el).attr('href') || '';
+ // The /vXXXX ID form is strictly preferred
+ let m = /^\/(v[0-9A-Za-z]+)(?:[-/.]|$)/.exec(href);
+ let id = m?.[1] || null;
+
+ // Minimal fallback for /video/123: no conversion here; the details endpoint resolves the real ID by parsing the page.
+ const isLegacy = !id && /^\/video\//.test(href);
+
+ if (!id && !isLegacy) return;
+
+ const card = $(el).closest('li, article, .video-listing-entry, .video-item, .video-card, div');
+
+ const title = ($(el).attr('title') || '').trim() || $(el).text().trim() || card.find('h3, h2, .video-item--title').first().text().trim();
+
+ // Thumbnail: prefer data-src (lazy-loaded images) over src
+ let thumb =
+ card.find('img').first().attr('data-src')
+ || card.find('img').first().attr('src')
+ || '';
if (thumb && thumb.startsWith('//')) thumb = 'https:' + thumb;
- const durationText = parent.find('.video-item--duration, .video-duration, .duration').first().text().trim();
- const viewsText = parent.find('.video-item--views, .rumbles-views, .views').first().text().trim();
- const duration = (() => {
- const m = durationText.match(/(\d+):(\d+)(?::(\d+))?/);
- if (!m) return 0;
- const h = parseInt(m[3] || '0', 10), mn = parseInt(m[1] || '0', 10), s = parseInt(m[2] || '0', 10);
- return h * 3600 + mn * 60 + s;
- })();
- const views = parseInt((viewsText || '').replace(/[^0-9]/g, '')) || 0;
- items.push({ videoId: vid, title, thumbnail: thumb, uploaderName: '', views, duration, uploadedDate: '', url: `https://rumble.com/${vid}`, type: 'video' });
+
+ const durationText =
+ card.find('.video-item--duration, .video-duration, .duration, .video-item__duration').first().text().trim();
+ const viewsText =
+ card.find('.video-item--views, .rumbles-views, .views, .video-item__views').first().text().trim();
+
+ const duration = parseDurationToSeconds(durationText);
+ const views = parseInt((viewsText || '').replace(/[^\d]/g, ''), 10) || 0;
+
+ // Always emit a URL built on rumble.com from the chosen videoId
+ let url = null;
+ let videoId = null;
+
+ if (id) {
+ videoId = id;
+ url = `https://rumble.com/${id}`;
+ } else if (isLegacy) {
+ // Let the /video/:slug details endpoint handle normalization
+ videoId = href.replace(/^\//, ''); // "video/123..."
+ url = `https://rumble.com/${videoId}`;
+ }
+
+ // videoId (ID or "video/123...") is the key used for de-duplication below
+ const key = videoId;
+ found.push({
+ videoId: key,
+ title,
+ thumbnail: thumb,
+ uploaderName: '',
+ views,
+ duration,
+ uploadedDate: '',
+ url,
+ type: 'video'
+ });
});
- // De-duplicate by videoId and slice to limit
+
+ // De-dupe
const seen = new Set();
const unique = [];
- for (const it of items) { if (!seen.has(it.videoId)) { seen.add(it.videoId); unique.push(it); } }
+ for (const it of found) {
+ if (!it.videoId) continue;
+ if (seen.has(it.videoId)) continue;
+ seen.add(it.videoId);
+ unique.push(it);
+ }
+
+ // Apply limit + compute nextCursor (page-based)
const list = unique.slice(0, limit);
const nextCursor = list.length === limit ? String(Number(page) + 1) : null;
- return { items: list, total: unique.length, page: Number(page), limit: Number(limit), nextCursor };
+
+ return {
+ items: list,
+ total: unique.length,
+ page: Number(page),
+ limit: Number(limit),
+ nextCursor
+ };
} catch (e) {
- console.error('scrapeRumbleList error:', e.message);
- return { items: [], total: 0, page: Number(page), limit: Number(limit), nextCursor: null };
+ return {
+ items: [],
+ total: 0,
+ page: Number(page),
+ limit: Number(limit),
+ nextCursor: null,
+ error: (e && e.message) ? e.message : String(e)
+ };
}
}
+/* --------------------------------- Routes -------------------------------- */
router.get('/browse', async (req, res) => {
- const page = parseInt(String(req.query.page || '1'), 10) || 1;
- const limit = Math.min(50, parseInt(String(req.query.limit || '24'), 10) || 24);
+ const page = Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
+ const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));
const sort = String(req.query.sort || 'viral');
+
const key = cacheKey('/browse', { page, limit, sort });
const cached = getCache(key);
if (cached) return res.json(cached);
+
const data = await scrapeRumbleList({ page, limit, sort });
setCache(key, data);
return res.json(data);
@@ -148,37 +350,67 @@ router.get('/browse', async (req, res) => {
router.get('/search', async (req, res) => {
const q = String(req.query.q || '').trim();
if (!q) return res.status(400).json({ error: 'Query parameter required' });
- const limit = Math.min(50, parseInt(String(req.query.limit || '24'), 10) || 24);
+
+ const limit = Math.min(50, Math.max(1, parseInt(String(req.query.limit || '24'), 10) || 24));
const page = (() => {
- // Support offset-based cursor from frontend by translating offset->page
if (req.query.offset != null) {
const offset = parseInt(String(req.query.offset), 10) || 0;
return Math.floor(offset / limit) + 1;
}
- return parseInt(String(req.query.page || '1'), 10) || 1;
+ return Math.max(1, parseInt(String(req.query.page || '1'), 10) || 1);
})();
+
const key = cacheKey('/search', { q, page, limit });
const cached = getCache(key);
if (cached) return res.json(cached);
+
const data = await scrapeRumbleList({ q, page, limit });
setCache(key, data);
return res.json(data);
});
+/* ----------------- Optional: ad-free "pre-player" (non-embed) -------------- */
+/**
+ * Rumble's ads are NOT disabled (no reliable official parameter exists for that).
+ * Instead this route returns the metadata for a "preplay" component that shows the
+ * thumbnail/title and, on click, either (A) opens the video on Rumble (cleanest UX)
+ * or (B) injects the official iframe (which triggers Rumble's own ad logic).
+ * Must stay registered BEFORE the wildcard details route: ':videoId(*)' also matches
+ * ".../preplay", so with the opposite order this handler would never run.
+ */
+router.get('/video/:videoId/preplay', async (req, res) => {
+ const raw = String(req.params.videoId);
+ const norm = normalizeRumbleId(raw) || { urlCanonique: `https://rumble.com/${raw}` };
+ const data = await scrapeRumbleVideo(norm.id || norm.urlCanonique);
+ if (data.error) return res.status(404).json(data);
+ const preplay = {
+ videoId: data.videoId,
+ title: data.title,
+ thumbnail: data.thumbnail,
+ rumbleUrl: data.url, // target of the "open on the provider's site" button
+ embedUrl: data.embedUrl // deferred iframe injection if the user chooses to play in place
+ };
+ res.json(preplay);
+});
-router.get('/video/:videoId', async (req, res) => {
+// Details endpoint. :videoId may be "vXXXX" OR "video/123..." (hence the wildcard).
+router.get('/video/:videoId(*)', async (req, res) => {
try {
- const { videoId } = req.params;
- const key = cacheKey('/video', { videoId });
+ const raw = String(req.params.videoId);
+ const key = cacheKey('/video', { videoId: raw });
const cached = getCache(key);
if (cached) return res.json(cached);
- const videoData = await scrapeRumbleVideo(videoId);
- if (videoData.error) return res.status(404).json({ error: 'Video not found or scraping failed' });
- setCache(key, videoData);
- return res.json(videoData);
+
+ // Normalize as far as possible before scraping
+ const norm = normalizeRumbleId(raw) || { urlCanonique: `https://rumble.com/${raw}` };
+ const data = await scrapeRumbleVideo(norm.id || norm.urlCanonique);
+ if (data.error) return res.status(404).json(data);
+
+ setCache(key, data);
+ return res.json(data);
} catch (error) {
- console.error('Rumble video error:', error);
return res.status(500).json({ error: 'Failed to scrape video' });
}
});
+
export default router;