Switched to tabs. Added missing actor entries when scraping actors, with batch ID.
This commit is contained in:
272
src/sites.js
272
src/sites.js
@@ -8,189 +8,189 @@ const knex = require('./knex');
|
||||
const whereOr = require('./utils/where-or');
|
||||
|
||||
/**
 * Shape a raw `sites` row (with its aliased `network_*` columns) into the nested site object.
 *
 * @param {Object} site - Row with the site columns plus network_id, network_name, network_description, network_slug, network_url and network_parameters.
 * @param {boolean} [includeParameters=false] - Expose the site and network `parameters`; when false both are null.
 * @param {boolean} [includeTags=true] - Query `sites_tags` joined with `tags` and attach the rows as `tags`.
 * @returns {Promise<Object>} The curated site; `tags` is only present when `includeTags` is truthy.
 */
async function curateSite(site, includeParameters = false, includeTags = true) {
	const curatedSite = {
		id: site.id,
		name: site.name,
		url: site.url,
		description: site.description,
		slug: site.slug,
		// always a real boolean; the raw `independent` value (possibly undefined) used to leak through
		independent: Boolean(site.parameters && site.parameters.independent),
		parameters: includeParameters ? site.parameters : null,
		network: {
			id: site.network_id,
			name: site.network_name,
			description: site.network_description,
			slug: site.network_slug,
			url: site.network_url,
			parameters: includeParameters ? site.network_parameters : null,
		},
	};

	if (includeTags) {
		curatedSite.tags = await knex('sites_tags')
			.select('tags.*', 'sites_tags.inherit')
			.where('site_id', site.id)
			.join('tags', 'tags.id', 'sites_tags.tag_id');
	}

	return curatedSite;
}
|
||||
|
||||
/**
 * Curate a list of raw site rows concurrently.
 *
 * @param {Object[]} sites - Raw rows, each passed to `curateSite`.
 * @param {boolean} [includeParameters] - Forwarded to `curateSite`; tags keep that function's default.
 * @returns {Promise<Object[]>} Curated sites in the same order as the input.
 */
async function curateSites(sites, includeParameters) {
	return Promise.all(sites.map(site => curateSite(site, includeParameters)));
}
|
||||
|
||||
/**
 * Split a config include/exclude list into whole networks and individually listed sites.
 *
 * An array entry contributes the items of its second element to `sites`;
 * any other entry is added to `networks` as-is.
 *
 * @param {Array} [networks=[]] - Config entries; `null` is treated like an empty list.
 * @returns {{ networks: Array, sites: Array }} The separated entries, in input order.
 */
function destructConfigNetworks(networks = []) {
	const destructed = {
		networks: [],
		sites: [],
	};

	(networks || []).forEach((network) => {
		if (Array.isArray(network)) {
			// network specifies sites
			destructed.sites.push(...network[1]);
			return;
		}

		destructed.networks.push(network);
	});

	return destructed;
}
|
||||
|
||||
/**
 * Look up a site by URL, trying several normalized variants of the given URL.
 *
 * Candidates matched against `sites.url`: the URL as given, its origin, the origin without a
 * leading `www.` or `tour.` label, the hostname with `www.` prepended (https and http), and
 * the origin plus the first path segment.
 *
 * @param {string} url - Absolute URL; `new URL` throws a TypeError when it cannot be parsed.
 * @returns {Promise<Object|null>} The curated site (with parameters, without tags), or null when no row matches.
 */
async function findSiteByUrl(url) {
	const { origin, hostname, pathname } = new URL(url);
	const dirUrl = `${origin}${pathname.split('/').slice(0, 2).join('/')}`; // allow for sites on URI directory

	// only strip a leading www./tour. label; an unanchored match would mangle hosts such as detour.com
	const bareOrigin = origin.replace(/^(https?:\/\/)(?:www|tour)\./, '$1');

	const site = await knex('sites')
		.leftJoin('networks', 'sites.network_id', 'networks.id')
		.select(
			'sites.*',
			'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
		)
		.where('sites.url', url)
		.orWhere('sites.url', origin)
		.orWhere('sites.url', bareOrigin)
		.orWhere('sites.url', `https://www.${hostname}`)
		.orWhere('sites.url', `http://www.${hostname}`)
		.orWhere('sites.url', dirUrl)
		.first();

	if (site) {
		return curateSite(site, true, false);
	}

	return null;
}
|
||||
|
||||
/**
 * Group curated sites under their network.
 *
 * @param {Object[]} sites - Curated sites, each with a `network` object carrying a `slug`.
 * @returns {Object[]} One entry per distinct network slug, in order of first appearance: the
 *   network's own properties plus a `sites` array of its sites in input order.
 */
function sitesByNetwork(sites) {
	// a Map keeps slugs such as 'constructor' from colliding with Object.prototype members
	const networks = sites.reduce((acc, site) => {
		const existing = acc.get(site.network.slug);

		if (existing) {
			existing.sites.push(site);

			return acc;
		}

		acc.set(site.network.slug, {
			...site.network,
			sites: [site],
		});

		return acc;
	}, new Map());

	return Array.from(networks.values());
}
|
||||
|
||||
/**
 * Fetch the sites selected through `argv.sites` and `argv.networks`, grouped by network.
 *
 * A site is returned when its slug is in `argv.sites` or its network's slug is in
 * `argv.networks`; a missing list is treated as empty.
 *
 * @returns {Promise<Object[]>} Networks, each with its curated sites (parameters included).
 */
async function fetchSitesFromArgv() {
	const rawSites = await knex('sites')
		.select(
			'sites.*',
			'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
		)
		.whereIn('sites.slug', argv.sites || [])
		.orWhereIn('networks.slug', argv.networks || [])
		.leftJoin('networks', 'sites.network_id', 'networks.id');

	const curatedSites = await curateSites(rawSites, true);
	logger.info(`Found ${curatedSites.length} sites in database`);

	return sitesByNetwork(curatedSites);
}
|
||||
|
||||
/**
 * Fetch the sites selected by `config.include` and `config.exclude`, grouped by network.
 *
 * When `config.include` is set, only sites or networks listed there are considered; without it
 * no include filter is applied. Sites or networks listed in `config.exclude` are then removed.
 *
 * @returns {Promise<Object[]>} Networks, each with its curated sites (parameters included).
 */
async function fetchSitesFromConfig() {
	const included = destructConfigNetworks(config.include);
	const excluded = destructConfigNetworks(config.exclude);

	const rawSites = await knex('sites')
		.select(
			'sites.*',
			'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
		)
		.leftJoin('networks', 'sites.network_id', 'networks.id')
		.where((builder) => {
			if (config.include) {
				builder
					.whereIn('sites.slug', included.sites)
					.orWhereIn('networks.slug', included.networks);
			}
		})
		// NOTE(review): if a site can have no network, its NULL networks.slug may make this NOT
		// condition evaluate to NULL and drop the row when networks are excluded - confirm
		.whereNot((builder) => {
			builder
				.whereIn('sites.slug', excluded.sites)
				.orWhereIn('networks.slug', excluded.networks);
		});

	const curatedSites = await curateSites(rawSites, true);
	logger.info(`Found ${curatedSites.length} sites in database`);

	return sitesByNetwork(curatedSites);
}
|
||||
|
||||
/**
 * Fetch the sites to process: the command-line selection takes precedence over the config.
 *
 * @returns {Promise<Object[]>} Result of `fetchSitesFromArgv` when `argv.networks` or
 *   `argv.sites` is set, otherwise of `fetchSitesFromConfig`.
 */
async function fetchIncludedSites() {
	if (argv.networks || argv.sites) {
		return fetchSitesFromArgv();
	}

	return fetchSitesFromConfig();
}
|
||||
|
||||
/**
 * Fetch up to 100 sites matching a query object.
 *
 * @param {Object} queryObject - Passed to the `whereOr` helper together with the 'sites' table
 *   name to build the filter; the exact matching rules live in that helper.
 * @returns {Promise<Object[]>} Curated sites; `includeParameters` is not passed on, so
 *   parameters are left out.
 */
async function fetchSites(queryObject) {
	const sites = await knex('sites')
		.where(builder => whereOr(queryObject, 'sites', builder))
		.select(
			'sites.*',
			'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
		)
		.leftJoin('networks', 'sites.network_id', 'networks.id')
		.limit(100);

	return curateSites(sites);
}
|
||||
|
||||
/**
 * Fetch up to 100 sites referenced by rows in `releases`.
 *
 * NOTE(review): the query is kept exactly as it was. It selects only `site_id` plus an empty
 * column name and does not join `networks`, so the rows handed to `curateSites` appear to lack
 * the site and network columns that `curateSite` reads - confirm the intended select list.
 *
 * @returns {Promise<Object[]>} Curated sites built from the query rows.
 */
async function fetchSitesFromReleases() {
	const sites = await knex('releases')
		.select('site_id', '')
		.leftJoin('sites', 'sites.id', 'releases.site_id')
		.groupBy('sites.id')
		.limit(100);

	return curateSites(sites);
}
|
||||
|
||||
module.exports = {
|
||||
curateSite,
|
||||
curateSites,
|
||||
fetchIncludedSites,
|
||||
fetchSites,
|
||||
fetchSitesFromConfig,
|
||||
fetchSitesFromArgv,
|
||||
fetchSitesFromReleases,
|
||||
findSiteByUrl,
|
||||
curateSite,
|
||||
curateSites,
|
||||
fetchIncludedSites,
|
||||
fetchSites,
|
||||
fetchSitesFromConfig,
|
||||
fetchSitesFromArgv,
|
||||
fetchSitesFromReleases,
|
||||
findSiteByUrl,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user