Scrapers can now iterate through pages. Releases are now filtered for uniqueness before being saved to the database. Improved scrapers and rendering.

2019-04-05 03:45:40 +02:00
parent cbb4fdc919
commit 2b818e379a
14 changed files with 99 additions and 49 deletions
--- a/src/scrapers/legalporno.js
+++ b/src/scrapers/legalporno.js
@@ -23,14 +23,15 @@ function scrapeLatest(html, site) {
        const sceneLinkElement = $(element).find('.thumbnail-title a');
        const url = sceneLinkElement.attr('href');

-        const originalTitle = sceneLinkElement.attr('title');
+        const originalTitle = sceneLinkElement.text().trim(); // title attribute breaks when they use \\ escaping
        const { shootId, title } = extractTitle(originalTitle);
+        const internalId = new URL(url).pathname.split('/')[2];

        const date = moment.utc($(element).attr('release'), 'YYYY/MM/DD').toDate();

        return {
            url,
-            shootId,
+            shootId: shootId || internalId,
            title,
            date,
            site,
@@ -68,8 +69,8 @@ async function scrapeScene(html, url, site) {
    };
 }

-async function fetchLatest(site) {
-    const res = await bhttp.get(`${site.url}/new-videos`);
+async function fetchLatest(site, page = 1) {
+    const res = await bhttp.get(`${site.url}/new-videos/${page}`);

    return scrapeLatest(res.body.toString(), site);
 }