Storing actor avatars. Using a 1-second interval queue for location resolution, as per the OSM Nominatim usage policy.

2020-05-16 04:36:45 +02:00
parent 21d4dd6bfa
commit 05ee57378a
28 changed files with 899 additions and 751 deletions

View File

@@ -13,6 +13,7 @@ const logger = require('./logger')(__filename);
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
const resolvePlace = require('./utils/resolve-place');
const { associateAvatars } = require('./media');
const { toBaseReleases } = require('./deep');
@@ -80,6 +81,7 @@ function curateProfileEntry(profile) {
has_piercings: profile.hasPiercings,
piercings: profile.piercings,
tattoos: profile.tattoos,
avatar_media_id: profile.avatarMediaId || null,
};
return curatedProfileEntry;
@@ -91,6 +93,7 @@ async function curateProfile(profile) {
id: profile.id,
name: profile.name,
avatar: profile.avatar,
scraper: profile.scraper,
};
curatedProfile.site = profile.site.isNetwork ? null : profile.site;
@@ -198,6 +201,7 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
return {
...actor,
...profile,
scraper: scraperSlug,
site: siteOrNetwork,
};
}), Promise.reject(new Error()));
@@ -256,16 +260,16 @@ async function upsertProfiles(curatedProfileEntries) {
}
if (argv.force && updatingProfileEntries.length > 0) {
knex.transaction(async (transaction) => {
const queries = updatingProfileEntries.map(profileEntry => knex('actors_profiles')
.where('id', profileEntry.id)
.update(profileEntry)
.transacting(transaction));
const transaction = await knex.transaction();
const queries = updatingProfileEntries.map(profileEntry => knex('actors_profiles')
.where('id', profileEntry.id)
.update(profileEntry)
.returning(['id', 'actor_id'])
.transacting(transaction));
return Promise.all(queries)
.then(transaction.commit)
.catch(transaction.rollback);
});
await Promise.all(queries)
.then(transaction.commit)
.catch(transaction.rollback);
}
}
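For readers less familiar with this knex idiom: the hunk above replaces the callback form of knex.transaction() with a manually managed transaction object, binds every update to it via .transacting(), and asks the database to report the touched rows via .returning(). A minimal sketch of the same pattern in try/catch form (table and column names taken from the hunk above; everything else is illustrative):
// Sketch only: manually managed knex transaction, equivalent in spirit to the hunk above.
const transaction = await knex.transaction();
try {
  // Every update is bound to the same transaction and reports the rows it touched.
  const updatedRows = await Promise.all(updatingProfileEntries.map(profileEntry => knex('actors_profiles')
    .where('id', profileEntry.id)
    .update(profileEntry)
    .returning(['id', 'actor_id'])
    .transacting(transaction)));
  await transaction.commit();
} catch (error) {
  await transaction.rollback();
  throw error;
}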
@@ -305,7 +309,9 @@ async function scrapeActors(actorNames) {
);
const profiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
const profilesWithAvatarIds = await associateAvatars(profiles);
const curatedProfileEntries = profilesWithAvatarIds.map(profile => curateProfileEntry(profile));
await upsertProfiles(curatedProfileEntries);
}
@@ -350,7 +356,7 @@ async function associateActors(releases, batchId) {
const baseActors = Object.values(baseActorsByReleaseId).flat();
if (baseActors.length === 0) {
return;
return null;
}
const baseActorsBySlugAndNetworkId = baseActors.reduce((acc, baseActor) => ({
@@ -382,6 +388,8 @@ async function associateActors(releases, batchId) {
.flat();
await knex.raw(`${knex('releases_actors').insert(releaseActorAssociations).toString()} ON CONFLICT DO NOTHING;`);
return actors;
}
module.exports = {
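Taken together, the actors.js changes thread the avatar through three steps: curateProfile() keeps profile.avatar and profile.scraper, associateAvatars() (added in media.js below) stores the image and attaches the resulting media id, and curateProfileEntry() maps that id onto the avatar_media_id column. A rough sketch with made-up values:
// Illustrative flow of one profile through the new avatar pipeline (names and ids are invented).
const profile = {
  name: 'Example Actor',
  avatar: ['https://cdn.example.com/example-actor.jpg'],
  scraper: 'examplescraper',
};
const [profileWithAvatarId] = await associateAvatars([profile]);
// -> { ...profile, avatarMediaId: 'aBcDeF123' }   (id of the stored media entry)
const profileEntry = curateProfileEntry(profileWithAvatarId);
// -> { ..., avatar_media_id: 'aBcDeF123' }        (persisted by upsertProfiles)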

View File

@@ -106,10 +106,11 @@ function toBaseSource(rawSource) {
return null;
}
function baseSourceToBaseMedia(baseSource, role) {
function baseSourceToBaseMedia(baseSource, role, metadata) {
if (Array.isArray(baseSource)) {
if (baseSource.length > 0) {
return {
...metadata,
id: nanoid(),
role,
sources: baseSource,
@@ -121,6 +122,7 @@ function baseSourceToBaseMedia(baseSource, role) {
if (baseSource) {
return {
...metadata,
id: nanoid(),
role,
sources: [baseSource],
@@ -130,15 +132,15 @@ function baseSourceToBaseMedia(baseSource, role) {
return null;
}
function fallbackMediaToBaseMedia(rawMedia, role) {
function fallbackMediaToBaseMedia(rawMedia, role, metadata) {
const baseSources = rawMedia
.map(source => toBaseSource(source))
.filter(Boolean);
return baseSourceToBaseMedia(baseSources, role);
return baseSourceToBaseMedia(baseSources, role, metadata);
}
function toBaseMedias(rawMedias, role) {
function toBaseMedias(rawMedias, role, metadata) {
if (!rawMedias || rawMedias.length === 0) {
return [];
}
@@ -150,12 +152,12 @@ function toBaseMedias(rawMedias, role) {
if (Array.isArray(rawMedia)) {
// fallback sources provided
return fallbackMediaToBaseMedia(rawMedia, role);
return fallbackMediaToBaseMedia(rawMedia, role, metadata);
}
const baseSource = toBaseSource(rawMedia);
return baseSourceToBaseMedia(baseSource, role);
return baseSourceToBaseMedia(baseSource, role, metadata);
}).filter(Boolean);
const sampledBaseMedias = sampleMedias(baseMedias);
@@ -604,6 +606,44 @@ async function associateReleaseMedia(releases) {
}, Promise.resolve());
}
async function associateAvatars(profiles) {
if (!argv.media) {
return profiles;
}
const profilesWithBaseMedias = profiles.map(profile => (profile.avatar
? {
...profile,
avatarBaseMedia: toBaseMedias([profile.avatar], 'avatars', {
copyright: profile.network?.name || null,
scraper: profile.scraper || null,
})[0],
}
: profile
));
const baseMedias = profilesWithBaseMedias.map(profile => profile.avatarBaseMedia).filter(Boolean);
const storedMedias = await storeMedias(baseMedias);
const storedMediasById = itemsByKey(storedMedias, 'id');
const profilesWithAvatarIds = profilesWithBaseMedias.map((profile) => {
const media = storedMediasById[profile.avatarBaseMedia?.id];
if (media) {
return {
...profile,
avatarMediaId: media.use || media.entry.id,
};
}
return profile;
});
return profilesWithAvatarIds;
}
module.exports = {
associateAvatars,
associateReleaseMedia,
};
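The metadata argument threaded through toBaseMedias() above is spread into each base media object next to its generated id, so copyright and scraper travel with the avatar into storeMedias(). Roughly, one avatar entry looks like this (shape inferred from the hunks above; values are illustrative, and the exact source objects produced by toBaseSource() are not shown in this diff):
// Illustrative shape of a single avatar base media entry (not copied from the repo).
const exampleAvatarBaseMedia = {
  copyright: 'Example Network',   // metadata: profile.network?.name
  scraper: 'examplescraper',      // metadata: profile.scraper
  id: 'V1StGXR8_Z5jdHi6B-myT',    // generated by nanoid()
  role: 'avatars',
  sources: [/* one or more fallback sources from toBaseSource() */],
};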

View File

@@ -365,8 +365,8 @@ function scrapeApiProfile(data, releases, siteSlug) {
if (data.attributes.eye_color) profile.eyes = data.attributes.eye_color;
if (data.attributes.hair_color) profile.hair = data.attributes.hair_color;
const avatarPath = Object.values(data.pictures).reverse()[0];
if (avatarPath) profile.avatar = `https://images01-evilangel.gammacdn.com/actors${avatarPath}`;
const avatarPaths = Object.values(data.pictures).reverse();
if (avatarPaths.length > 0) profile.avatar = avatarPaths.map(avatarPath => `https://images01-evilangel.gammacdn.com/actors${avatarPath}`);
profile.releases = releases.map(release => `https://${siteSlug}.com/en/video/${release.url_title}/${release.clip_id}`);
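For context, returning an array of URLs here means toBaseMedias() above treats them as ordered fallback sources for a single image rather than as separate photos, so a later URL is only used if an earlier one cannot be fetched. A made-up example of the resulting value:
// Illustrative only; real paths come from data.pictures in the API response.
profile.avatar = [
  'https://images01-evilangel.gammacdn.com/actors/example/500x750.jpg',
  'https://images01-evilangel.gammacdn.com/actors/example/240x360.jpg',
];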

View File

@@ -342,8 +342,6 @@ function scrapeProfile(html, url, actorName) {
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), el => el.href);
console.log(profile);
return profile;
}

View File

@@ -5,7 +5,7 @@ const config = require('config');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const slugify = require('./utils/slugify');
const { associateActors } = require('./actors');
const { associateActors, scrapeActors } = require('./actors');
const { associateReleaseTags } = require('./tags');
const { curateSite } = require('./sites');
const { associateReleaseMedia } = require('./media');
@@ -226,18 +226,19 @@ async function storeReleases(releases) {
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
const releasesWithId = attachReleaseIds([].concat(uniqueReleases, duplicateReleases), [].concat(storedReleaseEntries, duplicateReleaseEntries));
await Promise.all([
const [actors] = await Promise.all([
associateActors(releasesWithId, batchId),
associateReleaseTags(releasesWithId),
]);
await updateReleasesSearch(releasesWithId.map(release => release.id));
// media is more error-prone, associate separately
await associateReleaseMedia(releasesWithId);
await scrapeActors(actors.map(actor => actor.name));
logger.info(`Stored ${storedReleaseEntries.length} releases`);
await updateReleasesSearch(releasesWithId.map(release => release.id));
return releasesWithId;
}
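Read as a whole, the hunk above now captures the actors resolved by associateActors() and immediately scrapes their profiles, which (with the media.js change) also fetches their avatars. A sketch of how the tail of storeReleases() reads after this change, as far as the extracted diff shows:
// Sketch; ordering reconstructed from the hunk above.
const [actors] = await Promise.all([
  associateActors(releasesWithId, batchId),   // now resolves with the associated actors
  associateReleaseTags(releasesWithId),
]);
// media is more error-prone, associate separately
await associateReleaseMedia(releasesWithId);
await scrapeActors(actors.map(actor => actor.name));
logger.info(`Stored ${storedReleaseEntries.length} releases`);
await updateReleasesSearch(releasesWithId.map(release => release.id));
return releasesWithId;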

View File

@@ -36,17 +36,13 @@ function useProxy(url) {
const queue = taskQueue();
queue.on('concurrencyReached:http', () => {
logger.silly('Queueing requests');
});
queue.define('http', async ({
async function handler({
url,
method = 'GET',
body,
headers = {},
options = {},
}) => {
}) {
if (body) {
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)}`);
} else {
@@ -55,12 +51,12 @@ queue.define('http', async ({
const reqOptions = {
headers: {
...(options.defaultHeaders !== false && defaultHeaders),
...(options?.defaultHeaders !== false && defaultHeaders),
...headers,
},
...defaultOptions,
...options,
...(options.timeout && { responseTimeout: options.timeout }),
...(options?.timeout && { responseTimeout: options?.timeout }),
};
if (useProxy(url)) {
@@ -71,8 +67,8 @@ queue.define('http', async ({
? await bhttp[method.toLowerCase()](url, body, reqOptions)
: await bhttp[method.toLowerCase()](url, reqOptions);
if (options.stream && options.destination) {
await pipeline(res, ...(options.transforms || []), options.destination);
if (options?.stream && options?.destination) {
await pipeline(res, ...(options?.transforms || []), options?.destination);
}
const html = Buffer.isBuffer(res.body) ? res.body.toString() : null;
@@ -88,12 +84,22 @@ queue.define('http', async ({
code: res.statusCode,
status: res.statusCode,
};
}, {
}
queue.on('concurrencyReached:http', () => {
logger.silly('Queueing requests');
});
queue.define('20p', handler, {
concurrency: 20,
});
async function get(url, headers, options) {
return queue.push('http', {
queue.define('1s', handler, {
interval: 1,
});
async function get(url, headers, options, queueMethod = '20p') {
return queue.push(queueMethod, {
method: 'GET',
url,
headers,
@@ -101,8 +107,8 @@ async function get(url, headers, options) {
});
}
async function post(url, body, headers, options) {
return queue.push('http', {
async function post(url, body, headers, options, queueMethod = '20p') {
return queue.push(queueMethod, {
method: 'POST',
url,
body,
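To make the queue split concrete: callers keep the previous behaviour by default ('20p', up to 20 concurrent requests) and opt into the rate-limited queue by passing '1s', which spaces requests one second apart. A hypothetical pair of calls (URLs and headers are placeholders):
// Default queue: up to 20 concurrent requests.
const apiRes = await get('https://www.example.com/api/actors', {}, {});
// Interval queue: at most one request per second, e.g. for Nominatim.
const geoRes = await get('https://nominatim.openstreetmap.org/search/Madrid?format=json', {
  'User-Agent': 'example-scraper/1.0 (contact@example.com)',  // placeholder contact header
}, null, '1s');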

View File

@@ -12,7 +12,7 @@ async function resolvePlace(query) {
// https://operations.osmfoundation.org/policies/nominatim/
const res = await http.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`, {
'User-Agent': 'contact at moonloop.adult@protonmail.com',
});
}, null, '1s');
const [item] = res.body;
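For reference, the Nominatim search endpoint responds with an array of candidate places, which is why only the first element is destructured above; with addressdetails=1 each candidate roughly follows this shape (abridged from the public Nominatim JSON format, values illustrative):
// Abridged example of one res.body entry (Nominatim JSON output with addressdetails=1).
const exampleNominatimHit = {
  place_id: 123456,
  lat: '40.4167047',
  lon: '-3.7035825',
  display_name: 'Madrid, Comunidad de Madrid, Spain',
  address: { city: 'Madrid', state: 'Comunidad de Madrid', country: 'Spain', country_code: 'es' },
};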

View File

@@ -12,7 +12,7 @@ const schemaExtender = makeExtendSchemaPlugin(_build => ({
}
extend type Actor {
age: Int @requires(columns: ["birthdate"])
age: Int @requires(columns: ["date_of_birth"])
height(units:Units): String @requires(columns: ["height"])
weight(units:Units): String @requires(columns: ["weight"])
}