Changed sites from argument query to group by network.
This commit is contained in:
parent
79465d9634
commit
1907ce1e54
Binary file not shown.
After Width: | Height: | Size: 1.4 MiB |
Binary file not shown.
After Width: | Height: | Size: 8.4 KiB |
Binary file not shown.
After Width: | Height: | Size: 36 KiB |
Binary file not shown.
Before Width: | Height: | Size: 47 KiB After Width: | Height: | Size: 36 KiB |
|
@ -437,14 +437,14 @@ const networks = [
|
||||||
|
|
||||||
exports.seed = knex => Promise.resolve()
|
exports.seed = knex => Promise.resolve()
|
||||||
.then(async () => {
|
.then(async () => {
|
||||||
const { inserted, updated } = await upsert('entities', parentNetworks, 'slug', knex);
|
const { inserted, updated } = await upsert('entities', parentNetworks.map(network => ({ ...network, type: 1 })), ['slug', 'type'], knex);
|
||||||
const parentNetworksBySlug = [].concat(inserted, updated).reduce((acc, network) => ({ ...acc, [network.slug]: network.id }), {});
|
const parentNetworksBySlug = [].concat(inserted, updated).reduce((acc, network) => ({ ...acc, [network.slug]: network.id }), {});
|
||||||
|
|
||||||
const networksWithParent = networks.map(network => ({
|
const networksWithParent = networks.map(network => ({
|
||||||
slug: network.slug,
|
slug: network.slug,
|
||||||
name: network.name,
|
name: network.name,
|
||||||
type: network.type || 1,
|
type: network.type || 1,
|
||||||
alias: (network.alias || []).join(','),
|
alias: network.alias ? network.alias.join(',') : null,
|
||||||
url: network.url,
|
url: network.url,
|
||||||
description: network.description,
|
description: network.description,
|
||||||
parameters: network.parameters,
|
parameters: network.parameters,
|
||||||
|
|
|
@ -596,7 +596,7 @@ const tagPosters = [
|
||||||
['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'],
|
['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'],
|
||||||
['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
|
['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'],
|
||||||
['blonde', 0, 'Anikka Albrite and Lena Nicole or Cherie DeVille in the BTS of "New Zealand Holiday" for In The Crack'],
|
['blonde', 0, 'Anikka Albrite and Lena Nicole or Cherie DeVille in the BTS of "New Zealand Holiday" for In The Crack'],
|
||||||
['blowbang', 'poster', 'Marsha May in "Feeding Frenzy 12" for Jules Jordan'],
|
['blowbang', 0, 'Lacy Lennon in "Lacy Lennon\'s First Blowbang" for HardX'],
|
||||||
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
|
['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'],
|
||||||
['brunette', 0, 'Nicole Black in GIO971 for LegalPorno'],
|
['brunette', 0, 'Nicole Black in GIO971 for LegalPorno'],
|
||||||
['bukkake', 0, 'Jaye Summers in "Facialized 5" for HardX'],
|
['bukkake', 0, 'Jaye Summers in "Facialized 5" for HardX'],
|
||||||
|
@ -670,7 +670,8 @@ const tagPhotos = [
|
||||||
// ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
|
// ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
|
||||||
// ['anal', 0, 'Veronica Leal'],
|
// ['anal', 0, 'Veronica Leal'],
|
||||||
['behind-the-scenes', 1, 'Madison Ivy in "Day With A Pornstar" for Brazzers'],
|
['behind-the-scenes', 1, 'Madison Ivy in "Day With A Pornstar" for Brazzers'],
|
||||||
['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'],
|
['blowbang', 'poster', 'Marsha May in "Feeding Frenzy 12" for Jules Jordan'],
|
||||||
|
// ['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'],
|
||||||
['caucasian', 1, 'Sheena Shaw for Brazzers'],
|
['caucasian', 1, 'Sheena Shaw for Brazzers'],
|
||||||
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
|
['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'],
|
||||||
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
|
['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'],
|
||||||
|
|
112
src/entities.js
112
src/entities.js
|
@ -14,16 +14,10 @@ function curateEntity(entity, includeParameters = false) {
|
||||||
url: entity.url,
|
url: entity.url,
|
||||||
description: entity.description,
|
description: entity.description,
|
||||||
slug: entity.slug,
|
slug: entity.slug,
|
||||||
independent: !!entity.parameters && entity.parameters.independent,
|
type: entity.type,
|
||||||
parameters: includeParameters ? entity.parameters : null,
|
parameters: includeParameters ? entity.parameters : null,
|
||||||
network: {
|
parent: entity.parent,
|
||||||
id: entity.network_id,
|
children: (entity.children || []).map(child => curateEntity(child)),
|
||||||
name: entity.network_name,
|
|
||||||
description: entity.network_description,
|
|
||||||
slug: entity.network_slug,
|
|
||||||
url: entity.network_url,
|
|
||||||
parameters: includeParameters ? entity.network_parameters : null,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
return curatedEntity;
|
return curatedEntity;
|
||||||
|
@ -33,56 +27,59 @@ async function curateEntities(entities, includeParameters) {
|
||||||
return Promise.all(entities.map(async entity => curateEntity(entity, includeParameters)));
|
return Promise.all(entities.map(async entity => curateEntity(entity, includeParameters)));
|
||||||
}
|
}
|
||||||
|
|
||||||
async function findSiteByUrl(url) {
|
async function fetchSitesFromArgv() {
|
||||||
const { origin, hostname, pathname } = new URL(url);
|
|
||||||
// const domain = hostname.replace(/www.|tour./, '');
|
|
||||||
const dirUrl = `${origin}${pathname.split('/').slice(0, 2).join('/')}`; // allow for sites on URI directory
|
|
||||||
|
|
||||||
const site = await knex('sites')
|
|
||||||
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
|
||||||
.select(
|
|
||||||
'sites.*',
|
|
||||||
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
|
|
||||||
)
|
|
||||||
.where('sites.url', url)
|
|
||||||
.orWhere('sites.url', origin)
|
|
||||||
.orWhere('sites.url', origin.replace(/www\.|tour\./, ''))
|
|
||||||
.orWhere('sites.url', `https://www.${hostname}`)
|
|
||||||
.orWhere('sites.url', `http://www.${hostname}`)
|
|
||||||
.orWhere('sites.url', dirUrl)
|
|
||||||
// .orWhere('sites.url', 'like', `%${domain}`)
|
|
||||||
.first();
|
|
||||||
|
|
||||||
if (site) {
|
|
||||||
const curatedSite = curateSite(site, true, false);
|
|
||||||
|
|
||||||
return curatedSite;
|
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function fetchEntitiesFromArgv() {
|
|
||||||
const rawEntities = await knex.raw(`
|
const rawEntities = await knex.raw(`
|
||||||
WITH RECURSIVE temp AS (
|
WITH RECURSIVE temp AS (
|
||||||
SELECT id, parent_id, name, slug, type FROM entities WHERE slug IN (?)
|
SELECT
|
||||||
UNION
|
id, parent_id, name, slug, type, url, description, parameters
|
||||||
SELECT entities.id, entities.parent_id, entities.name, entities.slug, entities.type FROM entities
|
FROM
|
||||||
INNER JOIN temp ON temp.id = entities.parent_id
|
entities
|
||||||
) SELECT * FROM temp;
|
WHERE
|
||||||
`, argv.sites || argv.networks || argv.entities);
|
slug = ANY(?) AND entities.type = 1
|
||||||
|
UNION ALL
|
||||||
console.log(rawEntities.rows);
|
SELECT
|
||||||
|
entities.id, entities.parent_id, entities.name, entities.slug, entities.type, entities.url, entities.description, entities.parameters
|
||||||
|
FROM
|
||||||
|
entities
|
||||||
|
INNER JOIN
|
||||||
|
temp ON temp.id = entities.parent_id
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
entities.*, row_to_json(parents) as parent, json_agg(temp) as children
|
||||||
|
FROM
|
||||||
|
temp
|
||||||
|
LEFT JOIN
|
||||||
|
entities ON entities.id = temp.parent_id
|
||||||
|
LEFT JOIN
|
||||||
|
entities AS parents ON parents.id = entities.parent_id
|
||||||
|
WHERE
|
||||||
|
temp.type = 2
|
||||||
|
GROUP BY
|
||||||
|
temp.parent_id, entities.id, entities.name, parents.id
|
||||||
|
UNION ALL
|
||||||
|
SELECT
|
||||||
|
entities.*, row_to_json(parents) as parent, json_build_array(row_to_json(children))
|
||||||
|
FROM
|
||||||
|
entities AS children
|
||||||
|
LEFT JOIN
|
||||||
|
entities ON entities.id = children.parent_id
|
||||||
|
LEFT JOIN
|
||||||
|
entities AS parents ON parents.id = entities.parent_id
|
||||||
|
WHERE
|
||||||
|
children.slug = ANY(?) AND children.type = 2
|
||||||
|
GROUP BY
|
||||||
|
entities.id, parents.id, children.id;
|
||||||
|
`, [argv.networks || [], argv.sites || []]);
|
||||||
|
|
||||||
const curatedEntities = await curateEntities(rawEntities.rows, true);
|
const curatedEntities = await curateEntities(rawEntities.rows, true);
|
||||||
logger.info(`Found ${curatedEntities.length} entities in database`);
|
logger.info(`Found ${curatedEntities.length} entities in database`);
|
||||||
|
|
||||||
console.log(curatedEntities);
|
console.log(rawEntities.rows);
|
||||||
|
|
||||||
return curatedEntities;
|
return curatedEntities;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchEntitiesFromConfig() {
|
async function fetchSitesFromConfig() {
|
||||||
const rawSites = await knex('entities')
|
const rawSites = await knex('entities')
|
||||||
.select('entities.*')
|
.select('entities.*')
|
||||||
.leftJoin('entities as entities_parents', 'entities_parents.id', 'entities.id')
|
.leftJoin('entities as entities_parents', 'entities_parents.id', 'entities.id')
|
||||||
|
@ -95,18 +92,18 @@ async function fetchEntitiesFromConfig() {
|
||||||
builder.whereIn('entities.slug', config.exclude || []);
|
builder.whereIn('entities.slug', config.exclude || []);
|
||||||
});
|
});
|
||||||
|
|
||||||
const curatedSites = await curateSites(rawSites, true);
|
const curatedSites = await curateEntities(rawSites, true);
|
||||||
logger.info(`Found ${curatedSites.length} sites in database`);
|
logger.info(`Found ${curatedSites.length} entities in database`);
|
||||||
|
|
||||||
return curatedSites;
|
return curatedSites;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchIncludedEntities() {
|
async function fetchIncludedEntities() {
|
||||||
if (argv.networks || argv.sites) {
|
if (argv.networks || argv.sites) {
|
||||||
return fetchEntitiesFromArgv();
|
return fetchSitesFromArgv();
|
||||||
}
|
}
|
||||||
|
|
||||||
return fetchEntitiesFromConfig();
|
return fetchSitesFromConfig();
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchSites(queryObject) {
|
async function fetchSites(queryObject) {
|
||||||
|
@ -119,7 +116,7 @@ async function fetchSites(queryObject) {
|
||||||
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
.leftJoin('networks', 'sites.network_id', 'networks.id')
|
||||||
.limit(100);
|
.limit(100);
|
||||||
|
|
||||||
return curateSites(sites);
|
return curateEntities(sites);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchSitesFromReleases() {
|
async function fetchSitesFromReleases() {
|
||||||
|
@ -129,7 +126,7 @@ async function fetchSitesFromReleases() {
|
||||||
.groupBy('sites.id')
|
.groupBy('sites.id')
|
||||||
.limit(100);
|
.limit(100);
|
||||||
|
|
||||||
return curateSites(sites);
|
return curateEntities(sites);
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
@ -137,8 +134,7 @@ module.exports = {
|
||||||
curateEntities,
|
curateEntities,
|
||||||
fetchIncludedEntities,
|
fetchIncludedEntities,
|
||||||
fetchSites,
|
fetchSites,
|
||||||
fetchEntitiesFromConfig,
|
fetchSitesFromConfig,
|
||||||
fetchEntitiesFromArgv,
|
fetchSitesFromArgv,
|
||||||
fetchSitesFromReleases,
|
fetchSitesFromReleases,
|
||||||
findSiteByUrl,
|
|
||||||
};
|
};
|
||||||
|
|
|
@ -108,7 +108,7 @@ function scrapeAllT1(scenes, site, accSiteReleases) {
|
||||||
release.entryId = deriveEntryId(release);
|
release.entryId = deriveEntryId(release);
|
||||||
|
|
||||||
if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
|
if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
|
||||||
// filter out releases that were already scraped from a categorized site
|
// filter out releases that were already scraped from a categorized site, requires sequential site scraping
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ const logger = require('./logger')(__filename);
|
||||||
const knex = require('./knex');
|
const knex = require('./knex');
|
||||||
const include = require('./utils/argv-include')(argv);
|
const include = require('./utils/argv-include')(argv);
|
||||||
const scrapers = require('./scrapers/scrapers');
|
const scrapers = require('./scrapers/scrapers');
|
||||||
const { fetchEntitiesFromArgv, fetchEntitiesFromConfig } = require('./entities');
|
const { fetchSitesFromArgv, fetchSitesFromConfig } = require('./entities');
|
||||||
|
|
||||||
const afterDate = (() => {
|
const afterDate = (() => {
|
||||||
if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) {
|
if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) {
|
||||||
|
@ -219,8 +219,10 @@ async function scrapeNetworkParallel(network) {
|
||||||
|
|
||||||
async function fetchUpdates() {
|
async function fetchUpdates() {
|
||||||
const includedNetworks = argv.sites || argv.networks || argv.from
|
const includedNetworks = argv.sites || argv.networks || argv.from
|
||||||
? await fetchEntitiesFromArgv()
|
? await fetchSitesFromArgv()
|
||||||
: await fetchEntitiesFromConfig();
|
: await fetchSitesFromConfig();
|
||||||
|
|
||||||
|
// console.log('included', includedNetworks);
|
||||||
|
|
||||||
const scrapedNetworks = await Promise.map(
|
const scrapedNetworks = await Promise.map(
|
||||||
includedNetworks,
|
includedNetworks,
|
||||||
|
|
Loading…
Reference in New Issue