From 3ec7b1588661714199f60fc0e3a6953c823c8073 Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Tue, 19 Nov 2019 04:36:15 +0100
Subject: [PATCH] Improved scraping and association behavior.

---
 assets/components/actor/actor.vue     |  9 +++-
 migrations/20190325001339_releases.js | 10 ++++
 src/actors.js                         | 78 +++++++++++++++++++--------
 src/app.js                            | 11 +++-
 src/media.js                          | 40 +++++++-------
 src/releases.js                       | 72 +++++++++++++------------
 src/scrape-release.js                 |  6 ++-
 src/scrape-sites.js                   |  5 +-
 src/scrapers/freeones.js              | 21 +++++---
 src/tags.js                           |  8 +--
 10 files changed, 166 insertions(+), 94 deletions(-)

diff --git a/assets/components/actor/actor.vue b/assets/components/actor/actor.vue
index 0cb23c53..c3442704 100644
--- a/assets/components/actor/actor.vue
+++ b/assets/components/actor/actor.vue
@@ -20,6 +20,11 @@
           {{ actor.aliases.join(', ') }}
+
+          Gender
+          {{ actor.gender }}
+
+
           Date of birth
           {{ formatDate(actor.birthdate, 'MMMM D, YYYY') }} ({{ age }})
@@ -51,7 +56,7 @@
           {{ actor.residencePlace }}
-
+
           Ethnicity
           {{ actor.ethnicity }}
@@ -61,7 +66,7 @@
           {{ actor.height }}
-
+
  • Boobs {{ actor.boobSize }} {{ actor.boobsNatural ? 'Natural' : 'Enhanced' }} diff --git a/migrations/20190325001339_releases.js b/migrations/20190325001339_releases.js index 5ff4ff1c..297d82a9 100644 --- a/migrations/20190325001339_releases.js +++ b/migrations/20190325001339_releases.js @@ -53,6 +53,9 @@ exports.up = knex => Promise.resolve() table.datetime('created_at') .defaultTo(knex.fn.now()); + + table.datetime('scraped_at'); + table.boolean('scrape_success'); })) .then(() => knex.schema.createTable('directors', (table) => { table.increments('id', 12); @@ -229,6 +232,8 @@ exports.up = knex => Promise.resolve() .notNullable() .references('id') .inTable('actors'); + + table.unique(['release_id', 'actor_id']); })) .then(() => knex.schema.createTable('directors_associated', (table) => { table.increments('id', 16); @@ -242,6 +247,8 @@ exports.up = knex => Promise.resolve() .notNullable() .references('id') .inTable('directors'); + + table.unique(['release_id', 'director_id']); })) .then(() => knex.schema.createTable('tags_associated', (table) => { table.integer('tag_id', 12) @@ -256,6 +263,9 @@ exports.up = knex => Promise.resolve() table.integer('release_id', 16) .references('id') .inTable('releases'); + + table.unique(['release_id', 'tag_id']); + table.unique(['site_id', 'tag_id']); })); exports.down = knex => Promise.resolve() diff --git a/src/actors.js b/src/actors.js index 3f20eaec..319e65d8 100644 --- a/src/actors.js +++ b/src/actors.js @@ -12,6 +12,7 @@ async function curateActor(actor) { return { id: actor.id, + gender: actor.gender, name: actor.name, description: actor.description, birthdate: actor.birthdate && new Date(actor.birthdate), @@ -43,10 +44,13 @@ function curateActors(releases) { return Promise.all(releases.map(async release => curateActor(release))); } -function curateScrapedActor(actor) { - return { +function curateActorEntry(actor, scraped, scrapeSuccess) { + const curatedActor = { id: actor.id, - name: actor.name, + name: actor.name + .split(' ') + .map(segment => `${segment.charAt(0).toUpperCase()}${segment.slice(1)}`) + .join(' '), slug: actor.name.toLowerCase().replace(/\s+/g, '-'), birthdate: actor.birthdate, description: actor.description, @@ -65,6 +69,16 @@ function curateScrapedActor(actor) { tattoos: actor.tattoos, piercings: actor.piercings, }; + + if (scraped) { + return { + ...curatedActor, + scraped_at: new Date(), + scrape_success: scrapeSuccess, + }; + } + + return curatedActor; } async function fetchActors(queryObject) { @@ -82,8 +96,8 @@ async function fetchActors(queryObject) { return curateActors(releases); } -async function storeActor(actor) { - const curatedActor = curateScrapedActor(actor); +async function storeActor(actor, scraped = false, scrapeSuccess = false) { + const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); const actorEntries = await knex('actors') .insert(curatedActor) @@ -102,8 +116,8 @@ async function storeActor(actor) { return null; } -async function updateActor(actorEntry, actor) { - const curatedActor = curateScrapedActor(actor); +async function updateActor(actorEntry, actor, scraped = false, scrapeSuccess = false) { + const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); const actorEntries = await knex('actors') .where({ id: actorEntry.id }) @@ -117,39 +131,59 @@ async function updateActor(actorEntry, actor) { async function scrapeActors(actorNames) { await Promise.map(actorNames || argv.actors, async (actorName) => { - const [actorEntry] = await fetchActors({ name: actorName }); - const 
profiles = await Promise.all(Object.values(scrapers.actors).map(scraper => scraper.fetchActor(actorName))); + const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-'); - if (actorEntry) { - return updateActor(actorEntry, profiles[0]); + const [actorEntry] = await fetchActors({ slug: actorSlug }); + const profiles = await Promise.all(Object.values(scrapers.actors).map(scraper => scraper.fetchActor(actorEntry ? actorEntry.name : actorName))); + + if (profiles[0] === null) { + console.log(`Could not find profile for actor '${actorName}'`); + return updateActor(actorEntry, actorEntry, true, false); } - return storeActor(profiles[0]); + if (actorEntry && profiles[0]) { + return updateActor(actorEntry, profiles[0], true, true); + } + + return storeActor(profiles[0], true, true); }, { - concurrency: 5, + concurrency: 1, }); } -async function storeActors(release, releaseEntry) { +async function scrapeBasicActors() { + const basicActors = await knex('actors').where('scraped_at', null); + + return scrapeActors(basicActors.map(actor => actor.name)); +} + +async function associateActors(release, releaseId) { const actorEntries = await knex('actors').whereIn('name', release.actors); const newActors = release.actors .map(actorName => actorName.trim()) .filter(actorName => !actorEntries.some(actor => actor.name === actorName)); - const newActorEntries = await Promise.all(newActors.map(async actorName => storeActor({ name: actorName }))); + const [newActorEntries, associatedActors] = await Promise.all([ + Promise.all(newActors.map(async actorName => storeActor({ name: actorName }))), + knex('actors_associated').where('release_id', releaseId), + ]); + + const newlyAssociatedActors = actorEntries + .concat(newActorEntries) + .filter(actorEntry => !associatedActors.some(actor => actorEntry.id === actor.id)) + .map(actor => ({ + release_id: releaseId, + actor_id: actor.id, + })); await knex('actors_associated') - .insert(actorEntries.concat(newActorEntries).map(actor => ({ - release_id: releaseEntry.id, - actor_id: actor.id, - })), '*'); - - scrapeActors(newActorEntries.map(actor => actor.name)); + .insert(newlyAssociatedActors); } module.exports = { + associateActors, fetchActors, scrapeActors, - storeActors, + scrapeBasicActors, }; diff --git a/src/app.js b/src/app.js index 3d8442da..d43224bb 100644 --- a/src/app.js +++ b/src/app.js @@ -6,7 +6,7 @@ const initServer = require('./web/server'); const scrapeSites = require('./scrape-sites'); const scrapeRelease = require('./scrape-release'); -const { scrapeActors } = require('./actors'); +const { scrapeActors, scrapeBasicActors } = require('./actors'); async function init() { if (argv.url) { @@ -24,13 +24,20 @@ async function init() { return; } - if (argv.actors) { + if (argv.actors && argv.actors.length > 0) { await scrapeActors(); knex.destroy(); return; } + if (argv.actors) { + await scrapeBasicActors(); + knex.destroy(); + + return; + } + await initServer(); } diff --git a/src/media.js b/src/media.js index e75c1693..9fe243a0 100644 --- a/src/media.js +++ b/src/media.js @@ -37,13 +37,13 @@ async function createMediaDirectory(release, releaseId) { } } -async function storePoster(release, releaseEntry) { +async function storePoster(release, releaseId) { if (!release.poster) { - console.warn(`No poster available for (${release.site.name}, ${releaseEntry.id}}) "${release.title}"`); + console.warn(`No poster available for (${release.site.name}, ${releaseId}}) "${release.title}"`); return; } - console.log(`Storing poster for (${release.site.name}, 
${releaseEntry.id}) "${release.title}"`); + console.log(`Storing poster for (${release.site.name}, ${releaseId}) "${release.title}"`); const res = await bhttp.get(release.poster); const thumbnail = await getThumbnail(res.body); @@ -53,8 +53,8 @@ async function storePoster(release, releaseEntry) { const mimetype = res.headers['content-type'] || mime.getType(pathname) || 'image/jpeg'; const extension = mime.getExtension(mimetype); - const filepath = path.join(release.site.network.slug, release.site.slug, releaseEntry.id.toString(), `poster.${extension}`); - const thumbpath = path.join(release.site.network.slug, release.site.slug, releaseEntry.id.toString(), `poster_thumb.${extension}`); + const filepath = path.join(release.site.network.slug, release.site.slug, releaseId.toString(), `poster.${extension}`); + const thumbpath = path.join(release.site.network.slug, release.site.slug, releaseId.toString(), `poster_thumb.${extension}`); const hash = getHash(res.body); await Promise.all([ @@ -69,23 +69,23 @@ async function storePoster(release, releaseEntry) { hash, source: release.poster, domain: 'releases', - target_id: releaseEntry.id, + target_id: releaseId, role: 'poster', }); return; } - console.warn(`Failed to store poster for (${release.site.name}, ${releaseEntry.id}) "${release.title}": ${res.statusCode}`); + console.warn(`Failed to store poster for (${release.site.name}, ${releaseId}) "${release.title}": ${res.statusCode}`); } -async function storePhotos(release, releaseEntry) { - if (release.photos.length === 0) { - console.warn(`No photos available for (${release.site.name}, ${releaseEntry.id}}) "${release.title}"`); +async function storePhotos(release, releaseId) { + if (!release.photos || release.photos.length === 0) { + console.warn(`No photos available for (${release.site.name}, ${releaseId}}) "${release.title}"`); return; } - console.log(`Storing ${release.photos.length} photos for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`); + console.log(`Storing ${release.photos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`); const files = await Promise.map(release.photos, async (photoUrl, index) => { const { pathname } = new URL(photoUrl); @@ -98,8 +98,8 @@ async function storePhotos(release, releaseEntry) { if (res.statusCode === 200) { const extension = mime.getExtension(mimetype); - const filepath = path.join(release.site.network.slug, release.site.slug, releaseEntry.id.toString(), `${index + 1}.${extension}`); - const thumbpath = path.join(release.site.network.slug, release.site.slug, releaseEntry.id.toString(), `${index + 1}_thumb.${extension}`); + const filepath = path.join(release.site.network.slug, release.site.slug, releaseId.toString(), `${index + 1}.${extension}`); + const thumbpath = path.join(release.site.network.slug, release.site.slug, releaseId.toString(), `${index + 1}_thumb.${extension}`); const hash = getHash(res.body); await Promise.all([ @@ -118,7 +118,7 @@ async function storePhotos(release, releaseEntry) { throw new Error(`Response ${res.statusCode} not OK`); } catch (error) { - console.warn(`Failed to store photo ${index + 1} for "${release.title}" (${photoUrl}, ${release.url}, ${release.site.name}, ${releaseEntry.id}): ${error}`); + console.warn(`Failed to store photo ${index + 1} for "${release.title}" (${photoUrl}, ${release.url}, ${release.site.name}, ${releaseId}): ${error}`); return null; } @@ -136,24 +136,24 @@ async function storePhotos(release, releaseEntry) { source: file.source, index, domain: 'releases', - 
target_id: releaseEntry.id, + target_id: releaseId, role: 'photo', }))); } -async function storeTrailer(release, releaseEntry) { +async function storeTrailer(release, releaseId) { if (!release.trailer || !release.trailer.src) { - console.warn(`No trailer available for (${release.site.name}, ${releaseEntry.id}}) "${release.title}"`); + console.warn(`No trailer available for (${release.site.name}, ${releaseId}}) "${release.title}"`); return; } - console.log(`Storing trailer for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`); + console.log(`Storing trailer for (${release.site.name}, ${releaseId}) "${release.title}"`); const { pathname } = new URL(release.trailer.src); const mimetype = release.trailer.type || mime.getType(pathname); const res = await bhttp.get(release.trailer.src); - const filepath = path.join(release.site.network.slug, release.site.slug, releaseEntry.id.toString(), `trailer${release.trailer.quality ? `_${release.trailer.quality}` : ''}.${mime.getExtension(mimetype)}`); + const filepath = path.join(release.site.network.slug, release.site.slug, releaseId.toString(), `trailer${release.trailer.quality ? `_${release.trailer.quality}` : ''}.${mime.getExtension(mimetype)}`); await Promise.all([ fs.writeFile(path.join(config.media.path, filepath), res.body), @@ -162,7 +162,7 @@ async function storeTrailer(release, releaseEntry) { mime: mimetype, source: release.trailer.src, domain: 'releases', - target_id: releaseEntry.id, + target_id: releaseId, role: 'trailer', quality: release.trailer.quality || null, }), diff --git a/src/releases.js b/src/releases.js index 925a7e6e..be7c3d21 100644 --- a/src/releases.js +++ b/src/releases.js @@ -4,8 +4,8 @@ const Promise = require('bluebird'); const knex = require('./knex'); const argv = require('./argv'); const whereOr = require('./utils/where-or'); -const { storeTags } = require('./tags'); -const { storeActors } = require('./actors'); +const { associateTags } = require('./tags'); +const { associateActors } = require('./actors'); const { createMediaDirectory, storePoster, @@ -141,15 +141,6 @@ async function fetchReleases(queryObject = {}, options = {}) { return curateReleases(releases); } -async function fetchReleasesByEntryIds(entryIds, queryObject = {}, options = {}) { - const releases = await knex('releases') - .modify(commonQuery, options) - .whereIn('entry_id', entryIds) - .andWhere(builder => whereOr(queryObject, 'releases', builder)); - - return curateReleases(releases); -} - async function fetchSiteReleases(queryObject, options = {}) { const releases = await knex('releases') .modify(commonQuery, options) @@ -192,41 +183,52 @@ async function fetchTagReleases(queryObject, options = {}) { return curateReleases(releases); } +async function storeReleaseAssets(release, releaseId) { + await createMediaDirectory(release, releaseId); + + await Promise.all([ + associateActors(release, releaseId), + associateTags(release, releaseId), + storePhotos(release, releaseId), + storePoster(release, releaseId), + storeTrailer(release, releaseId), + ]); +} + async function storeRelease(release) { + const existingRelease = await knex('releases').where('entry_id', release.entryId).first(); const curatedRelease = curateScrapedRelease(release); - const releaseEntries = await knex('releases') + if (existingRelease && !argv.redownload) { + return existingRelease.id; + } + + if (existingRelease && argv.redownload) { + const [updatedRelease] = await knex('releases') + .where('entry_id', existingRelease.id) + .update({ + ...existingRelease, + 
...curatedRelease, + }) + .returning('*'); + + await storeReleaseAssets(release, existingRelease.id); + console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`); + + return updatedRelease || existingRelease; + } + + const [releaseEntry] = await knex('releases') .insert(curatedRelease) .returning('*'); - if (releaseEntries.length) { - const releaseEntry = releaseEntries[0]; - - console.log(`Stored (${release.site.name}, ${releaseEntry.id}) "${release.title}"`); - - await createMediaDirectory(release, releaseEntry.id); - - await Promise.all([ - storeActors(release, releaseEntry), - storeTags(release, releaseEntry), - storePhotos(release, releaseEntry), - storePoster(release, releaseEntry), - storeTrailer(release, releaseEntry), - ]); - - return releaseEntry.id; - } - - console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`); + await storeReleaseAssets(release, releaseEntry.id); + console.log(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`); return null; } async function storeReleases(releases) { - const existingReleases = await fetchReleasesByEntryIds(releases.map(release => release.entryId)); - - console.log(existingReleases); - return Promise.map(releases, async (release) => { try { const releaseId = await storeRelease(release); diff --git a/src/scrape-release.js b/src/scrape-release.js index 81be87c4..db5f5453 100644 --- a/src/scrape-release.js +++ b/src/scrape-release.js @@ -7,6 +7,7 @@ const scrapers = require('./scrapers/scrapers'); const { storeReleases } = require('./releases'); const { findSiteByUrl } = require('./sites'); const { findNetworkByUrl } = require('./networks'); +const { scrapeBasicActors } = require('./actors'); async function findSite(url, release) { const site = (release && release.site) || await findSiteByUrl(url); @@ -48,7 +49,10 @@ async function scrapeRelease(url, release, deep = false) { if (!deep && argv.save) { // don't store release when called by site scraper - const releaseId = await storeReleases([scene]); + const releaseId = await Promise.all([ + storeReleases([scene]), + scrapeBasicActors(), + ]); console.log(`http://${config.web.host}:${config.web.port}/scene/${releaseId}`); } diff --git a/src/scrape-sites.js b/src/scrape-sites.js index 410261de..961772b3 100644 --- a/src/scrape-sites.js +++ b/src/scrape-sites.js @@ -9,6 +9,7 @@ const { fetchIncludedSites } = require('./sites'); const scrapers = require('./scrapers/scrapers'); const scrapeRelease = require('./scrape-release'); const { storeReleases } = require('./releases'); +const { scrapeBasicActors } = require('./actors'); function getAfterDate() { return moment @@ -58,7 +59,7 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a async function scrapeUpcomingReleases(scraper, site) { if (scraper.fetchUpcoming) { - const upcomingReleases = scraper.fetchUpcoming(site); + const upcomingReleases = await scraper.fetchUpcoming(site); return upcomingReleases.map(release => ({ ...release, upcoming: true })); } @@ -131,6 +132,8 @@ async function scrapeReleases() { }, { concurrency: 2, }); + + await scrapeBasicActors(); } module.exports = scrapeReleases; diff --git a/src/scrapers/freeones.js b/src/scrapers/freeones.js index 6bdf1324..47f7f005 100644 --- a/src/scrapers/freeones.js +++ b/src/scrapers/freeones.js @@ -19,7 +19,9 @@ async function scrapeActorFrontpage(html, url, name) { const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] 
}), {}); const birthdateString = bio['Date of Birth:']; - const birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate(); + const birthdate = birthdateString && birthdateString !== 'Unknown (Add)' + ? moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate() + : null; const boobsSizeString = bio['Measurements:']; const boobsSize = boobsSizeString === '??-??-??' ? null : boobsSizeString; @@ -74,8 +76,9 @@ async function scrapeActorBio(html, frontpageBio, url, name) { const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {}); const birthdateString = bio['Date of Birth:']; - const birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate(); - const active = bio['Career Status:'].trim() === 'Active'; + const birthdate = birthdateString && birthdateString !== 'Unknown' + ? moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate() + : null; const boobsSizeString = bio['Measurements:']; const boobsSize = boobsSizeString === '??-??-??' ? null : boobsSizeString; @@ -114,7 +117,6 @@ async function scrapeActorBio(html, frontpageBio, url, name) { eyes, piercings, tattoos, - active, social, }; } @@ -124,11 +126,16 @@ async function fetchActor(actorName) { const frontpageUrl = `https://freeones.com/html/v_links/${slug}`; const resFrontpage = await bhttp.get(frontpageUrl); - const { url, bio } = await scrapeActorFrontpage(resFrontpage.body.toString(), frontpageUrl, actorName); - const resBio = await bhttp.get(url); + if (resFrontpage.statusCode === 200) { + const { url, bio } = await scrapeActorFrontpage(resFrontpage.body.toString(), frontpageUrl, actorName); - return scrapeActorBio(resBio.body.toString(), bio, url, actorName); + const resBio = await bhttp.get(url); + + return scrapeActorBio(resBio.body.toString(), bio, url, actorName); + } + + return null; } module.exports = { diff --git a/src/tags.js b/src/tags.js index 77d23490..ce0d4cab 100644 --- a/src/tags.js +++ b/src/tags.js @@ -24,15 +24,15 @@ function curateTags(tags) { return Promise.all(tags.map(async tag => curateTag(tag))); } -async function storeTags(release, releaseEntry) { +async function associateTags(release, releaseId) { if (!release.tags || release.tags.length === 0) { - console.warn(`No tags available for (${release.site.name}, ${releaseEntry.id}}) "${release.title}"`); + console.warn(`No tags available for (${release.site.name}, ${releaseId}}) "${release.title}"`); return; } await knex('tags_associated').insert(release.tags.map(tagId => ({ tag_id: tagId, - release_id: releaseEntry.id, + release_id: releaseId, }))); } @@ -74,7 +74,7 @@ async function matchTags(rawTags) { } module.exports = { - storeTags, + associateTags, fetchTags, matchTags, };