Added screen caps separate from photos. Added Tokyo Hot. Added hair type, shoe size and blood type actor fields.

This commit is contained in:
DebaucheryLibrarian
2023-07-25 03:03:41 +02:00
parent 6fe212796b
commit 693983dc29
32 changed files with 472 additions and 113 deletions

View File

@@ -106,6 +106,21 @@ const ethnicities = {
white: 'white',
};
// Recognized blood types, each keyed by its own label: every ABO group bare,
// followed by its Rh-positive and Rh-negative variants. Used as a whitelist
// by looking up a trimmed, upper-cased scraped value.
const bloodTypes = Object.fromEntries(
	['A', 'B', 'AB', 'O']
		.flatMap((group) => [group, `${group}+`, `${group}-`])
		.map((type) => [type, type]),
);
function getBoolean(value) {
if (typeof value === 'boolean') {
return value;
@@ -195,6 +210,7 @@ function toBaseActors(actorsOrNames, release) {
name,
slug,
entryId: (entity && (entryId || actorOrName.entryId)) || null,
suppliedEntryId: entryId,
entity,
hasProfile: !!actorOrName.name, // actor contains profile information
};
@@ -257,12 +273,15 @@ function curateActor(actor, withDetails = false, isProfile = false) {
circumcised: actor.circumcised,
height: actor.height,
weight: actor.weight,
shoeSize: actor.shoe_size,
eyes: actor.eyes,
hairColor: actor.hair_color,
hairType: actor.hair_type,
hasTattoos: actor.has_tattoos,
hasPiercings: actor.has_piercings,
tattoos: actor.tattoos,
piercings: actor.piercings,
bloodType: actor.blood_type,
...(isProfile && { description: actor.description }),
placeOfBirth: actor.birth_country && {
country: {
@@ -347,12 +366,15 @@ function curateProfileEntry(profile) {
natural_boobs: profile.naturalBoobs,
height: profile.height,
weight: profile.weight,
shoe_size: profile.shoeSize,
hair_color: profile.hairColor,
hair_type: profile.hairType,
eyes: profile.eyes,
has_tattoos: profile.hasTattoos,
has_piercings: profile.hasPiercings,
piercings: profile.piercings,
tattoos: profile.tattoos,
blood_type: profile.bloodType,
avatar_media_id: profile.avatarMediaId || null,
};
@@ -386,6 +408,7 @@ async function curateProfile(profile, actor) {
curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available
curatedProfile.ethnicity = ethnicities[profile.ethnicity?.trim().toLowerCase()] || null;
curatedProfile.hairType = profile.hairType?.trim() || null;
curatedProfile.hairColor = hairColors[(profile.hairColor || profile.hair)?.toLowerCase().replace('hair', '').trim()] || null;
curatedProfile.eyes = eyeColors[profile.eyes?.trim().toLowerCase()] || null;
@@ -411,6 +434,7 @@ async function curateProfile(profile, actor) {
curatedProfile.height = Number(profile.height) || profile.height?.match?.(/\d+/)?.[0] || null;
curatedProfile.weight = Number(profile.weight) || profile.weight?.match?.(/\d+/)?.[0] || null;
curatedProfile.shoeSize = Number(profile.shoeSize) || profile.shoeSize?.match?.(/\d+/)?.[0] || null;
// separate measurement values
curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match?.(/[a-zA-Z]+/)?.[0]) || null;
@@ -435,6 +459,7 @@ async function curateProfile(profile, actor) {
curatedProfile.naturalBoobs = getBoolean(profile.naturalBoobs);
curatedProfile.hasTattoos = getBoolean(profile.hasTattoos);
curatedProfile.hasPiercings = getBoolean(profile.hasPiercings);
curatedProfile.bloodType = bloodTypes[profile.bloodType?.trim().toUpperCase()] || null;
if (argv.resolvePlace) {
const [placeOfBirth, placeOfResidence] = await Promise.all([
@@ -564,6 +589,7 @@ async function interpolateProfiles(actorIdsOrNames) {
'bust',
'waist',
'hip',
'shoe_size',
'penis_length',
'penis_girth',
'circumcised',
@@ -571,6 +597,7 @@ async function interpolateProfiles(actorIdsOrNames) {
'eyes',
'has_tattoos',
'has_piercings',
'blood_type',
].reduce((acc, property) => ({
...acc,
[property]: getMostFrequent(valuesByProperty[property]),

View File

@@ -16,7 +16,8 @@ const logger = require('./logger')(__filename);
const knex = require('./knex');
const fetchUpdates = require('./updates');
const { fetchScenes, fetchMovies } = require('./deep');
const { storeScenes, storeMovies, updateSceneSearch, updateMovieSearch, associateMovieScenes } = require('./store-releases');
const { storeScenes, storeMovies, associateMovieScenes } = require('./store-releases');
const { updateSceneSearch, updateMovieSearch } = require('./update-search');
const { scrapeActors, deleteActors, flushActors, flushProfiles, interpolateProfiles } = require('./actors');
const { flushEntities } = require('./entities');
const { deleteScenes, deleteMovies, flushScenes, flushMovies, flushBatches } = require('./releases');

View File

@@ -226,6 +226,11 @@ const { argv } = yargs
type: 'boolean',
default: true,
})
.option('caps', {
describe: 'Include release screen caps',
type: 'boolean',
default: true,
})
.option('trailers', {
describe: 'Include release trailers',
type: 'boolean',

View File

@@ -567,7 +567,7 @@ async function storeFile(media, options) {
return storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath, options);
}
if (['posters', 'photos', 'covers'].includes(media.role)) {
if (['posters', 'photos', 'caps', 'covers'].includes(media.role)) {
throw new Error(`Media for '${media.role}' must be an image, but '${media.meta.mimetype}' was detected`);
}
@@ -873,6 +873,7 @@ async function associateReleaseMedia(releases, type = 'release') {
...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []),
...(argv.images && argv.covers ? toBaseMedias(release.covers, 'covers') : []),
...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos') : []),
...(argv.images && argv.caps ? toBaseMedias(release.caps, 'caps') : []),
...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []),
...(argv.videos && argv.teaser ? toBaseMedias([release.teaser], 'teasers') : []),
],
@@ -888,7 +889,7 @@ async function associateReleaseMedia(releases, type = 'release') {
return acc;
}, {});
await Promise.reduce(['posters', 'covers', 'photos', 'teasers', 'trailers'], async (chain, role) => {
await Promise.reduce(['posters', 'covers', 'photos', 'caps', 'teasers', 'trailers'], async (chain, role) => {
// stage by role so posters are prioritized over photos and videos
await chain;
@@ -1006,6 +1007,7 @@ async function flushOrphanedMedia() {
knex('tags_photos').select('media_id'),
knex('releases_posters').select('media_id'),
knex('releases_photos').select('media_id'),
knex('releases_caps').select('media_id'),
knex('releases_covers').select('media_id'),
knex('releases_trailers').select('media_id'),
knex('releases_teasers').select('media_id'),

View File

@@ -5,6 +5,7 @@ const inquirer = require('inquirer');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const argv = require('./argv');
const { updateSceneSearch } = require('./update-search');
const { flushOrphanedMedia } = require('./media');
const { graphql } = require('./web/graphql');
@@ -303,6 +304,8 @@ async function deleteScenes(sceneIds) {
.whereRaw('id = ANY(:sceneIds)', { sceneIds })
.delete();
await updateSceneSearch(sceneIds);
logger.info(`Removed ${deleteCount}/${sceneIds.length} scenes`);
return deleteCount;

View File

@@ -61,6 +61,7 @@ const spizoo = require('./spizoo');
const teamskeet = require('./teamskeet');
const teencoreclub = require('./teencoreclub');
const teenmegaworld = require('./teenmegaworld');
const tokyohot = require('./tokyohot');
const topwebmodels = require('./topwebmodels');
const traxxx = require('./traxxx');
const vivid = require('./vivid');
@@ -151,6 +152,7 @@ const scrapers = {
teencoreclub,
teenmegaworld,
teamskeet,
tokyohot,
topwebmodels,
transbella: porndoe,
traxxx,
@@ -288,6 +290,7 @@ const scrapers = {
teencoreclub,
teenmegaworld,
thatsitcomshow: nubiles,
tokyohot,
topwebmodels,
transangels: mindgeek,
transbella: porndoe,

171
src/scrapers/tokyohot.js Normal file
View File

@@ -0,0 +1,171 @@
'use strict';
const unprint = require('unprint');
const slugify = require('../utils/slugify');
/**
 * Convert the tiles of a listing page into base releases.
 *
 * @param {Array<{ query: object }>} scenes - One query context per listing tile.
 * @param {{ url: string }} channel - Channel whose URL is used to prefix relative scene links.
 * @returns {Array<object>} One release per tile with url, entryId, shootId, title,
 *   description and, when the tile has an image, poster candidates.
 */
function scrapeAll(scenes, channel) {
	return scenes.map(({ query }) => {
		const release = {};
		const pathname = query.url();

		release.url = unprint.prefixUrl(pathname, channel.url);
		// a tile without a link must not abort the whole page
		release.entryId = pathname?.match(/product\/(\w+)/)?.[1];
		release.shootId = query.attribute('img', 'title');

		release.title = query.content('.title')?.replace(/^tokyo hot\s*/i, '');
		release.description = query.content('.text');

		const poster = query.img();

		// only build poster candidates when the tile has an image; calling
		// .replace() on a missing poster would throw and discard every scene
		if (poster) {
			release.poster = [
				poster.replace('220x124', '820x462'), // larger variant first, thumbnail as fallback
				poster,
			];
		}

		return release;
	});
}
/**
 * Scrape a single scene page into a release.
 *
 * @param {{ query: object }} context - Query context of the fetched scene page.
 * @param {string} url - Absolute scene URL; the entry ID is taken from its `/product/<id>` path.
 * @param {{ url: string }} channel - Channel whose URL is the origin for actor links.
 * @returns {object} Release with entryId, shootId, title, description, date, duration,
 *   actors, tags, trailer, photos, caps and, when the page has one, poster candidates.
 */
function scrapeScene({ query }, url, channel) {
	const release = {};

	release.entryId = new URL(url).pathname.match(/product\/(\w+)/)?.[1];
	release.shootId = query.content('//dt[contains(text(), "Product ID")]/following-sibling::dd[1]');

	release.title = query.content('.contents h2');
	release.description = query.content('.contents .sentence');

	release.date = query.date('//dt[contains(text(), "Release Date")]/following-sibling::dd[1]', 'YYYY/MM/DD');
	release.duration = query.duration('//dt[contains(text(), "Duration")]/following-sibling::dd[1]');

	release.actors = query.all('.info a[href*="/cast"]').map((el) => ({
		name: unprint.query.content(el),
		url: unprint.query.url(el, null, { origin: channel.url }),
	}));

	release.tags = query.contents('.info a[href*="type=play"]');

	const poster = query.poster('.movie video');

	// a page without a video poster must still yield the rest of the release;
	// calling .replace() on a missing poster would throw
	if (poster) {
		release.poster = [
			poster,
			poster.replace('820x462', '220x124'), // thumbnail variant as fallback
		];
	}

	release.trailer = query.video('.movie source');

	// photos and screen caps are kept apart; each entry is [full size, thumbnail fallback]
	release.photos = query.imgs('.scap a', { attribute: 'href' }).map((img) => [
		img,
		img.replace('640x480_wlimited', '150x150_default'),
	]);

	release.caps = query.imgs('.vcap a', { attribute: 'href' }).map((img) => [
		img,
		img.replace('640x480_wlimited', '120x120_default'),
	]);

	return release;
}
/**
 * Extract a measurement from a string such as "85 ~ 89cm".
 *
 * Only numbers directly followed by "cm" are considered, and the last one wins,
 * so for a range the upper bound is used.
 *
 * @param {string} string - Raw measurement text.
 * @param {boolean} [inches=false] - Convert the value to inches, rounded to a whole number.
 * @returns {number|null} The measurement, or null when the input is empty or has no "cm" value.
 */
function getMeasurement(string, inches = false) {
	if (!string) {
		return null;
	}

	const matches = [...string.matchAll(/(\d+(?:\.\d+)?)\s*cm/g)];

	if (matches.length === 0) {
		return null;
	}

	const centimeters = Number(matches[matches.length - 1][1]);

	return inches
		? Math.round(centimeters * 0.393701)
		: centimeters;
}
/**
 * Scrape an actor page into a raw profile.
 *
 * The bio is read from the `.info` definition list: each `dt` label is slugified
 * into a key and paired with the `dd` at the same index.
 *
 * @param {{ query: object }} context - Query context of the fetched actor page.
 * @returns {object} Profile with birthPlace, height, cup, bust, waist, hip, hairType,
 *   shoeSize, bloodType and avatar; fields absent from the page are undefined or null.
 */
function scrapeProfile({ query }) {
	const profile = {};

	const keys = query.contents('.info dt');
	const values = query.contents('.info dd');

	const bio = Object.fromEntries(keys.map((key, index) => [slugify(key, '_'), values[index]]));

	profile.birthPlace = bio.home_town;

	profile.height = getMeasurement(bio.height);
	profile.cup = bio.cup_size?.replace('cup', '').trim();

	// bust, waist and hip are requested in inches, height and shoe size are kept in centimeters
	profile.bust = getMeasurement(bio.bust_size, true);
	profile.waist = getMeasurement(bio.waist_size, true);
	profile.hip = getMeasurement(bio.hip_size || bio.hip, true);

	// profile curation reads `hairType`; storing this as `hairStyle` would silently drop the value
	profile.hairType = bio.hair_style;
	profile.shoeSize = getMeasurement(bio.shoes_size);
	// not every profile lists a blood type; an unguarded .replace() would throw and lose the profile
	profile.bloodType = bio.blood_type?.replace('type', '').trim();

	profile.avatar = query.img('#profile img');

	return profile;
}
/**
 * Fetch one page of the channel's product listing, ordered by publish date.
 *
 * @param {{ url: string }} channel - Channel whose URL is the base of the listing URL.
 * @param {number} page - Listing page number.
 * @returns {Promise<Array<object>|number>} Scraped releases, or the response status when the request failed.
 */
async function fetchLatest(channel, page) {
	const listingUrl = `${channel.url}/product/?vendor=Tokyo-Hot&page=${page}&order=published_at`;

	const requestOptions = {
		selectAll: '#main .list .detail', // one query context per listing tile
		// NOTE(review): presumably forwarded to the HTTPS agent, disabling certificate verification - confirm this is intended
		agent: {
			rejectUnauthorized: false,
		},
	};

	const res = await unprint.get(listingUrl, requestOptions);

	if (!res.ok) {
		return res.status;
	}

	return scrapeAll(res.context, channel);
}
/**
 * Fetch a scene page and scrape it into a release.
 *
 * @param {string} url - Scene URL.
 * @param {{ url: string }} channel - Channel passed through to the scene scraper.
 * @returns {Promise<object|number>} The scraped release, or the response status when the request failed.
 */
async function fetchScene(url, channel) {
	// NOTE(review): presumably forwarded to the HTTPS agent, disabling certificate verification - confirm this is intended
	const requestOptions = { agent: { rejectUnauthorized: false } };
	const res = await unprint.get(url, requestOptions);

	return res.ok
		? scrapeScene(res.context, url, channel)
		: res.status;
}
/**
 * Fetch and scrape an actor profile from the actor's known page URL.
 *
 * @param {{ url?: string }} actor - Actor to look up; only `url` is read.
 * @param {object} context - Passed through to the profile scraper.
 * @returns {Promise<object|number|null>} The scraped profile, the response status when the
 *   request failed, or null when the actor has no URL.
 */
async function fetchProfile(actor, context) {
	const profileUrl = actor.url;

	if (!profileUrl) {
		// no lookup by name: searching the site is cumbersome
		return null;
	}

	// NOTE(review): presumably forwarded to the HTTPS agent, disabling certificate verification - confirm this is intended
	const requestOptions = { agent: { rejectUnauthorized: false } };
	const res = await unprint.get(profileUrl, requestOptions);

	return res.ok
		? scrapeProfile(res.context, context)
		: res.status;
}
// Scraper entry points; the scrape* helpers and getMeasurement stay module-private.
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
};

View File

@@ -16,6 +16,7 @@ const { associateActors, associateDirectors, scrapeActors, toBaseActors } = requ
const { associateReleaseTags } = require('./tags');
const { curateEntity } = require('./entities');
const { associateReleaseMedia } = require('./media');
const { updateSceneSearch, updateMovieSearch } = require('./update-search');
const { notify } = require('./alerts');
async function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
@@ -229,50 +230,6 @@ async function filterDuplicateReleases(releases) {
};
}
/**
 * Rebuild the full-text search documents for scenes and refresh the release summaries.
 *
 * Selects one English TSVECTOR document per release, built from the release title,
 * entry ID, shoot ID and several date spellings, the entity's and parent entity's
 * name, slug and aliases, actor and director names, names of tags with priority >= 6,
 * and names of secondary tag aliases. The rows are handed to bulkInsert for
 * `releases_search`.
 *
 * @param {Array} [releaseIds] - Release IDs to limit the rebuild to; when falsy, no WHERE
 *   clause is added and documents for all releases are rebuilt.
 * @returns {Promise<void>}
 */
async function updateSceneSearch(releaseIds) {
logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
// the ID array is bound as a single parameter for ANY(?); no bindings are passed when rebuilding everything
const documents = await knex.raw(`
SELECT
releases.id AS release_id,
TO_TSVECTOR(
'english',
COALESCE(releases.title, '') || ' ' ||
releases.entry_id || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
COALESCE(parents.name, '') || ' ' ||
COALESCE(parents.slug, '') || ' ' ||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
COALESCE(releases.shoot_id, '') || ' ' ||
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(directors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
) as document
FROM releases
LEFT JOIN entities ON releases.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = releases.id
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
LEFT JOIN actors ON local_actors.actor_id = actors.id
LEFT JOIN actors AS directors ON local_directors.director_id = directors.id
LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 6
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
GROUP BY releases.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
`, releaseIds && [releaseIds]);
if (documents.rows?.length > 0) {
// NOTE(review): the third argument is presumably the conflict key used to overwrite existing documents - confirm against bulkInsert
await bulkInsert('releases_search', documents.rows, ['release_id']);
}
// runs even when no documents were selected
await knex.raw('REFRESH MATERIALIZED VIEW releases_summaries;');
}
async function storeChapters(releases) {
const chapters = releases
.map((release) => release.chapters?.map((chapter, index) => ({
@@ -380,44 +337,6 @@ async function associateSerieScenes(series, serieScenes) {
await bulkInsert('series_scenes', associations, false);
}
/**
 * Rebuild the full-text search documents for movies, or for another collection of scenes.
 *
 * Selects one English TSVECTOR document per row of `<target>s`, built from its title and
 * several date spellings, the entity's and parent entity's name, slug and aliases, and
 * the titles, actor names and tag names of the scenes linked through `<target>s_scenes`.
 * The rows are handed to bulkInsert for `<target>s_search`.
 *
 * `target` is interpolated directly into table and column names in the SQL, so it must
 * be a trusted identifier stem and never user input.
 *
 * @param {Array} [movieIds] - IDs to limit the rebuild to; when falsy, no WHERE clause is
 *   added and documents for all rows are rebuilt.
 * @param {string} [target='movie'] - Singular table stem used to derive the table and column names.
 * @returns {Promise<void>}
 */
async function updateMovieSearch(movieIds, target = 'movie') {
logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all' } ${target}s`);
// the ID array is bound as a single parameter for ANY(?); no bindings are passed when rebuilding everything
const documents = await knex.raw(`
SELECT
${target}s.id AS ${target}_id,
TO_TSVECTOR(
'english',
COALESCE(${target}s.title, '') || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
COALESCE(parents.name, '') || ' ' ||
COALESCE(parents.slug, '') || ' ' ||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
COALESCE(TO_CHAR(${target}s.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(releases.title, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ')
) as document
FROM ${target}s
LEFT JOIN entities ON ${target}s.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN ${target}s_scenes ON ${target}s_scenes.${target}_id = ${target}s.id
LEFT JOIN releases ON releases.id = ${target}s_scenes.scene_id
LEFT JOIN releases_actors ON releases_actors.release_id = ${target}s_scenes.scene_id
LEFT JOIN releases_tags ON releases_tags.release_id = releases.id
LEFT JOIN actors ON actors.id = releases_actors.actor_id
LEFT JOIN tags ON tags.id = releases_tags.tag_id
${movieIds ? `WHERE ${target}s.id = ANY(?)` : ''}
GROUP BY ${target}s.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
`, movieIds && [movieIds]);
if (documents.rows?.length > 0) {
// NOTE(review): the third argument is presumably the conflict key used to overwrite existing documents - confirm against bulkInsert
await bulkInsert(`${target}s_search`, documents.rows, [`${target}_id`]);
}
}
async function storeMovies(movies, useBatchId) {
if (!movies || movies.length === 0) {
return [];

92
src/update-search.js Normal file
View File

@@ -0,0 +1,92 @@
'use strict';
const knex = require('./knex');
const logger = require('./logger')(__filename);
const bulkInsert = require('./utils/bulk-insert');
/**
 * Rebuild the full-text search documents for scenes and refresh the release summaries.
 *
 * Selects one English TSVECTOR document per release, built from the release title,
 * entry ID, shoot ID and several date spellings, the entity's and parent entity's
 * name, slug and aliases, actor and director names, names of tags with priority >= 6,
 * and names of secondary tag aliases. The rows are handed to bulkInsert for
 * `releases_search`.
 *
 * @param {Array} [releaseIds] - Release IDs to limit the rebuild to; when falsy, no WHERE
 *   clause is added and documents for all releases are rebuilt.
 * @returns {Promise<void>}
 */
async function updateSceneSearch(releaseIds) {
logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
// the ID array is bound as a single parameter for ANY(?); no bindings are passed when rebuilding everything
const documents = await knex.raw(`
SELECT
releases.id AS release_id,
TO_TSVECTOR(
'english',
COALESCE(releases.title, '') || ' ' ||
releases.entry_id || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
COALESCE(parents.name, '') || ' ' ||
COALESCE(parents.slug, '') || ' ' ||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
COALESCE(releases.shoot_id, '') || ' ' ||
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(directors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
) as document
FROM releases
LEFT JOIN entities ON releases.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = releases.id
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
LEFT JOIN actors ON local_actors.actor_id = actors.id
LEFT JOIN actors AS directors ON local_directors.director_id = directors.id
LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 6
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
GROUP BY releases.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
`, releaseIds && [releaseIds]);
if (documents.rows?.length > 0) {
// NOTE(review): the third argument is presumably the conflict key used to overwrite existing documents - confirm against bulkInsert
await bulkInsert('releases_search', documents.rows, ['release_id']);
}
// runs even when no documents were selected
await knex.raw('REFRESH MATERIALIZED VIEW releases_summaries;');
}
/**
 * Rebuild the full-text search documents for movies, or for another collection of scenes.
 *
 * Selects one English TSVECTOR document per row of `<target>s`, built from its title and
 * several date spellings, the entity's and parent entity's name, slug and aliases, and
 * the titles, actor names and tag names of the scenes linked through `<target>s_scenes`.
 * The rows are handed to bulkInsert for `<target>s_search`.
 *
 * `target` is interpolated directly into table and column names in the SQL, so it must
 * be a trusted identifier stem and never user input.
 *
 * @param {Array} [movieIds] - IDs to limit the rebuild to; when falsy, no WHERE clause is
 *   added and documents for all rows are rebuilt.
 * @param {string} [target='movie'] - Singular table stem used to derive the table and column names.
 * @returns {Promise<void>}
 */
async function updateMovieSearch(movieIds, target = 'movie') {
logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all' } ${target}s`);
// the ID array is bound as a single parameter for ANY(?); no bindings are passed when rebuilding everything
const documents = await knex.raw(`
SELECT
${target}s.id AS ${target}_id,
TO_TSVECTOR(
'english',
COALESCE(${target}s.title, '') || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
COALESCE(parents.name, '') || ' ' ||
COALESCE(parents.slug, '') || ' ' ||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
COALESCE(TO_CHAR(${target}s.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(releases.title, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ')
) as document
FROM ${target}s
LEFT JOIN entities ON ${target}s.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN ${target}s_scenes ON ${target}s_scenes.${target}_id = ${target}s.id
LEFT JOIN releases ON releases.id = ${target}s_scenes.scene_id
LEFT JOIN releases_actors ON releases_actors.release_id = ${target}s_scenes.scene_id
LEFT JOIN releases_tags ON releases_tags.release_id = releases.id
LEFT JOIN actors ON actors.id = releases_actors.actor_id
LEFT JOIN tags ON tags.id = releases_tags.tag_id
${movieIds ? `WHERE ${target}s.id = ANY(?)` : ''}
GROUP BY ${target}s.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
`, movieIds && [movieIds]);
if (documents.rows?.length > 0) {
// NOTE(review): the third argument is presumably the conflict key used to overwrite existing documents - confirm against bulkInsert
await bulkInsert(`${target}s_search`, documents.rows, [`${target}_id`]);
}
}
// Search document rebuilders for scenes and for movie-like collections.
module.exports = {
updateSceneSearch,
updateMovieSearch,
};