forked from DebaucheryLibrarian/traxxx
Fixed and documented entity configuration and query.
This commit is contained in:
parent
59e2124407
commit
77566eae0d
32
README.md
32
README.md
|
@ -18,6 +18,38 @@ Do not modify `config/default.js`, but instead create a copy at `config/local.js
|
||||||
|
|
||||||
You can also use `npm run flush` to run both steps at once, and wipe the database completely later.
|
You can also use `npm run flush` to run both steps at once, and wipe the database completely later.
|
||||||
|
|
||||||
|
#### Networks and channels
|
||||||
|
To scrape the networks and channels available in the database, you can configure `include` and `exclude` lists. To include all available channels and only use the `exclude` list, leave the `include` parameter unconfigured. The `exclude` lists will exclude channels and child networks from networks on the `include` lists, but not vice versa. That is, if the `include` list includes a network and the `exclude` list excludes one of that network's channels, the channel will not be scraped. However, if the `include` list includes a channel, and the `exclude` list includes its parent network, the channel will be scraped.
|
||||||
|
|
||||||
|
This configuration will scrape Evil Angel and all XEmpire channels, except for LesbianX.
|
||||||
|
```
|
||||||
|
include: {
|
||||||
|
networks: [
|
||||||
|
'xempire',
|
||||||
|
],
|
||||||
|
channels: [
|
||||||
|
'evilangel',
|
||||||
|
],
|
||||||
|
},
|
||||||
|
exclude: {
|
||||||
|
channels: [
|
||||||
|
'lesbianx',
|
||||||
|
],
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
This configuration will scrape all channels, except for BAM Visions and all channels that are part of the Vixen network.
|
||||||
|
```
|
||||||
|
exclude: {
|
||||||
|
channels: [
|
||||||
|
'bamvisions',
|
||||||
|
],
|
||||||
|
networks: [
|
||||||
|
'vixen'
|
||||||
|
],
|
||||||
|
},
|
||||||
|
```
|
||||||
|
|
||||||
### Building
|
### Building
|
||||||
To build traxxx, run the following command:
|
To build traxxx, run the following command:
|
||||||
|
|
||||||
|
|
|
@ -11,19 +11,7 @@ module.exports = {
|
||||||
sfwHost: '0.0.0.0',
|
sfwHost: '0.0.0.0',
|
||||||
sfwPort: 5001,
|
sfwPort: 5001,
|
||||||
},
|
},
|
||||||
include: {
|
|
||||||
networks: [
|
|
||||||
'xempire',
|
|
||||||
'julesjordan',
|
|
||||||
],
|
|
||||||
channels: [],
|
|
||||||
},
|
|
||||||
exclude: {
|
exclude: {
|
||||||
networks: [
|
|
||||||
'hardx',
|
|
||||||
'pornpros',
|
|
||||||
'mindgeek',
|
|
||||||
],
|
|
||||||
channels: [
|
channels: [
|
||||||
// 21sextreme, no longer updated
|
// 21sextreme, no longer updated
|
||||||
'mightymistress',
|
'mightymistress',
|
||||||
|
|
150
src/entities.js
150
src/entities.js
|
@ -1,6 +1,5 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
const util = require('util');
|
|
||||||
const config = require('config');
|
const config = require('config');
|
||||||
|
|
||||||
const logger = require('./logger')(__filename);
|
const logger = require('./logger')(__filename);
|
||||||
|
@ -9,7 +8,11 @@ const knex = require('./knex');
|
||||||
const whereOr = require('./utils/where-or');
|
const whereOr = require('./utils/where-or');
|
||||||
|
|
||||||
function curateEntity(entity, includeParameters = false) {
|
function curateEntity(entity, includeParameters = false) {
|
||||||
const curatedEntity = {
|
if (!entity) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const curatedEntity = entity.id ? {
|
||||||
id: entity.id,
|
id: entity.id,
|
||||||
name: entity.name,
|
name: entity.name,
|
||||||
url: entity.url,
|
url: entity.url,
|
||||||
|
@ -17,12 +20,15 @@ function curateEntity(entity, includeParameters = false) {
|
||||||
slug: entity.slug,
|
slug: entity.slug,
|
||||||
type: entity.type,
|
type: entity.type,
|
||||||
parameters: includeParameters ? entity.parameters : null,
|
parameters: includeParameters ? entity.parameters : null,
|
||||||
parent: entity.parent_id && entity.parent,
|
parent: curateEntity(entity.parent),
|
||||||
children: (entity.children || []).map(child => curateEntity({
|
} : {};
|
||||||
|
|
||||||
|
if (entity.children) {
|
||||||
|
curatedEntity.children = entity.children.map(child => curateEntity({
|
||||||
...child,
|
...child,
|
||||||
parent: entity,
|
parent: curatedEntity.id ? curatedEntity : null,
|
||||||
}, includeParameters)),
|
}, includeParameters));
|
||||||
};
|
}
|
||||||
|
|
||||||
return curatedEntity;
|
return curatedEntity;
|
||||||
}
|
}
|
||||||
|
@ -36,14 +42,14 @@ async function fetchChannelsFromArgv() {
|
||||||
/* networks from argument with channels as children */
|
/* networks from argument with channels as children */
|
||||||
WITH RECURSIVE children AS (
|
WITH RECURSIVE children AS (
|
||||||
SELECT
|
SELECT
|
||||||
id, parent_id, name, slug, type, url, description, parameters
|
entities.*
|
||||||
FROM
|
FROM
|
||||||
entities
|
entities
|
||||||
WHERE
|
WHERE
|
||||||
slug = ANY(?) AND entities.type = 'network'
|
slug = ANY(?) AND entities.type = 'network'
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT
|
SELECT
|
||||||
entities.id, entities.parent_id, entities.name, entities.slug, entities.type, entities.url, entities.description, entities.parameters
|
entities.*
|
||||||
FROM
|
FROM
|
||||||
entities
|
entities
|
||||||
INNER JOIN
|
INNER JOIN
|
||||||
|
@ -86,78 +92,44 @@ async function fetchChannelsFromArgv() {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchChannelsFromConfig() {
|
async function fetchChannelsFromConfig() {
|
||||||
console.log(config.include);
|
|
||||||
|
|
||||||
/*
|
|
||||||
const rawNetworks = await knex.raw(`
|
const rawNetworks = await knex.raw(`
|
||||||
WITH RECURSIVE children AS (
|
|
||||||
SELECT
|
|
||||||
id, parent_id, name, slug, type, url, description, parameters
|
|
||||||
FROM
|
|
||||||
entities
|
|
||||||
WHERE
|
|
||||||
CASE WHEN array_length(?, 1) IS NOT NULL
|
|
||||||
THEN slug = ANY(?)
|
|
||||||
ELSE true
|
|
||||||
END
|
|
||||||
AND NOT
|
|
||||||
slug = ANY(?)
|
|
||||||
AND
|
|
||||||
entities.type = 'network'
|
|
||||||
UNION ALL
|
|
||||||
SELECT
|
|
||||||
entities.id, entities.parent_id, entities.name, entities.slug, entities.type, entities.url, entities.description, entities.parameters
|
|
||||||
FROM
|
|
||||||
entities
|
|
||||||
INNER JOIN
|
|
||||||
children ON children.id = entities.parent_id
|
|
||||||
)
|
|
||||||
SELECT
|
|
||||||
entities.*, row_to_json(parents) as parent, json_agg(children) as children
|
|
||||||
FROM
|
|
||||||
children
|
|
||||||
LEFT JOIN
|
|
||||||
entities ON entities.id = children.parent_id
|
|
||||||
LEFT JOIN
|
|
||||||
entities AS parents ON parents.id = entities.parent_id
|
|
||||||
WHERE
|
|
||||||
children.type = 'channel'
|
|
||||||
GROUP BY
|
|
||||||
children.parent_id, entities.id, entities.name, parents.id
|
|
||||||
`, [
|
|
||||||
config.include.networks,
|
|
||||||
config.include.networks,
|
|
||||||
config.exclude.networks,
|
|
||||||
]);
|
|
||||||
*/
|
|
||||||
|
|
||||||
const rawNetworks = await knex.raw(`
|
|
||||||
/* select channels associated to configured networks */
|
|
||||||
WITH RECURSIVE channels AS (
|
WITH RECURSIVE channels AS (
|
||||||
/* select configured networks */
|
/* select configured channels and networks */
|
||||||
SELECT
|
SELECT
|
||||||
id, parent_id, name, type, slug
|
entities.*
|
||||||
FROM
|
FROM
|
||||||
entities
|
entities
|
||||||
WHERE
|
WHERE
|
||||||
(slug = ANY(:includeNetworks)
|
CASE WHEN :includeAll
|
||||||
AND NOT entities.slug = ANY(:excludedNetworks))
|
THEN
|
||||||
AND entities.type = 'network'
|
/* select all top level networks and independent channels */
|
||||||
|
entities.parent_id IS NULL
|
||||||
|
ELSE
|
||||||
|
((entities.slug = ANY(:includedNetworks)
|
||||||
|
AND entities.type = 'network')
|
||||||
|
OR (entities.slug = ANY(:includedChannels)
|
||||||
|
AND entities.type = 'channel'))
|
||||||
|
END
|
||||||
|
AND NOT (
|
||||||
|
(entities.slug = ANY(:excludedNetworks)
|
||||||
|
AND entities.type = 'network')
|
||||||
|
OR (entities.slug = ANY(:excludedChannels)
|
||||||
|
AND entities.type = 'channel'))
|
||||||
|
|
||||||
UNION ALL
|
UNION ALL
|
||||||
|
|
||||||
/* select recursive children of configured networks */
|
/* select recursive children of configured networks */
|
||||||
SELECT
|
SELECT
|
||||||
entities.id, entities.parent_id, entities.name, entities.type, entities.slug
|
entities.*
|
||||||
FROM
|
FROM
|
||||||
entities
|
entities
|
||||||
INNER JOIN
|
INNER JOIN
|
||||||
channels ON channels.id = entities.parent_id
|
channels ON channels.id = entities.parent_id
|
||||||
WHERE
|
WHERE
|
||||||
NOT (
|
NOT ((entities.slug = ANY(:excludedNetworks)
|
||||||
(entities.slug = ANY(:excludedNetworks) AND entities.type = 'network')
|
AND entities.type = 'network')
|
||||||
OR (entities.slug = ANY(:excludedChannels) AND entities.type = 'channel')
|
OR (entities.slug = ANY(:excludedChannels)
|
||||||
)
|
AND entities.type = 'channel'))
|
||||||
)
|
)
|
||||||
/* select recursive channels as children of networks */
|
/* select recursive channels as children of networks */
|
||||||
SELECT
|
SELECT
|
||||||
|
@ -170,51 +142,17 @@ async function fetchChannelsFromConfig() {
|
||||||
channels.type = 'channel'
|
channels.type = 'channel'
|
||||||
GROUP BY
|
GROUP BY
|
||||||
entities.id
|
entities.id
|
||||||
|
|
||||||
UNION ALL
|
|
||||||
|
|
||||||
/* select configured channels as children of networks */
|
|
||||||
SELECT
|
|
||||||
entities.*, json_agg(children) as children
|
|
||||||
FROM
|
|
||||||
entities AS children
|
|
||||||
LEFT JOIN
|
|
||||||
entities ON entities.id = children.parent_id
|
|
||||||
WHERE
|
|
||||||
children.slug = ANY(:includedChannels)
|
|
||||||
AND
|
|
||||||
children.type = 'channel'
|
|
||||||
GROUP BY
|
|
||||||
entities.id
|
|
||||||
`, {
|
`, {
|
||||||
includedNetworks: config.include.networks,
|
includeAll: !config.include?.networks && !config.include?.channels,
|
||||||
includedChannels: config.include.channels,
|
includedNetworks: config.include?.networks || [],
|
||||||
excludedNetworks: config.exclude.networks,
|
includedChannels: config.include?.channels || [],
|
||||||
excludedChannels: config.exclude.channels,
|
excludedNetworks: config.exclude?.networks || [],
|
||||||
|
excludedChannels: config.exclude?.channels || [],
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(util.inspect(rawNetworks.rows, null, null));
|
const curatedNetworks = rawNetworks.rows.map(entity => curateEntity(entity, true));
|
||||||
|
|
||||||
/*
|
return curatedNetworks;
|
||||||
const curatedSites = await curateEntities(rawChannels, true);
|
|
||||||
logger.info(`Found ${curatedSites.length} entities in database`);
|
|
||||||
|
|
||||||
const rawChannels = await knex('entities')
|
|
||||||
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
|
|
||||||
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
|
|
||||||
.where((builder) => {
|
|
||||||
if (config.include) {
|
|
||||||
builder.whereIn('entities.slug', config.include);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.whereNot((builder) => {
|
|
||||||
builder.whereIn('entities.slug', config.exclude || []);
|
|
||||||
});
|
|
||||||
|
|
||||||
console.log(rawChannels);
|
|
||||||
*/
|
|
||||||
|
|
||||||
// return curatedSites;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchIncludedEntities() {
|
async function fetchIncludedEntities() {
|
||||||
|
|
Loading…
Reference in New Issue