Experimentally scraping data from HardX.

This commit is contained in:
ThePendulum 2019-03-03 04:18:33 +01:00
parent cf8f299061
commit 71aa31dda6
3 changed files with 1771 additions and 67 deletions

1724
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,9 @@
   "description": "All the latest porn releases in one place",
   "main": "src/app.js",
   "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "start": "node src/app.js",
+    "eslint": "eslint src/",
+    "eslint-watch": "esw --watch src/"
   },
   "repository": {
     "type": "git",
@@ -22,9 +24,18 @@
     "@babel/cli": "^7.2.3",
     "@babel/core": "^7.3.4",
     "@babel/preset-env": "^7.3.4",
-    "babel-preset-airbnb": "^3.2.0"
+    "babel-eslint": "^10.0.1",
+    "babel-preset-airbnb": "^3.2.0",
+    "eslint": "^5.15.0",
+    "eslint-config-airbnb-base": "^13.1.0",
+    "eslint-plugin-import": "^2.16.0",
+    "eslint-watch": "^4.0.2"
   },
   "dependencies": {
-    "config": "^3.0.1"
+    "bhttp": "^1.2.4",
+    "cheerio": "^1.0.0-rc.2",
+    "config": "^3.0.1",
+    "date-fns": "^1.30.1",
+    "terminal-kit": "^1.27.0"
   }
 }

45
src/app.js Normal file
View File

@@ -0,0 +1,45 @@
'use strict';
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const { parse, format } = require('date-fns');
/**
 * Turn like/dislike counts into a 1-5 star score, truncated to two decimals.
 *
 * Each like is weighted as 5 stars and each dislike as 1 star.
 *
 * @param {number} likes - Number of likes.
 * @param {number} dislikes - Number of dislikes.
 * @returns {?number} The score, or null when there are no votes or a count
 *   is missing/non-numeric (the plain formula would yield NaN there).
 */
function computeStars(likes, dislikes) {
  const votes = likes + dislikes;

  if (!Number.isFinite(votes) || votes <= 0) {
    return null;
  }

  return Math.floor(((likes * 5 + dislikes) / votes) * 100) / 100;
}

/**
 * Convert a `MM-DD-YYYY` date string into ISO 8601 `YYYY-MM-DD`.
 *
 * @param {string} text - Raw date text; surrounding whitespace is ignored.
 * @returns {?string} The ISO date string, or null when the text does not
 *   match the expected layout or the month/day is out of range.
 */
function toIsoDate(text) {
  const match = /^(\d{1,2})-(\d{1,2})-(\d{4})$/.exec(String(text).trim());

  if (match === null) {
    return null;
  }

  const [, month, day, year] = match;
  const monthNumber = Number(month);
  const dayNumber = Number(day);

  if (monthNumber < 1 || monthNumber > 12 || dayNumber < 1 || dayNumber > 31) {
    return null;
  }

  return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
}

/**
 * Fetch the video listing page, extract one record per `.sceneInfo` element
 * and print the resulting array to stdout.
 *
 * Each record has `url`, `title`, `actors`, `date` and
 * `rating: { likes, dislikes, stars }`. `url`, `date` and `rating.stars` are
 * null when the page does not provide usable data for them.
 *
 * @returns {Promise<void>} Resolves once the scenes have been logged.
 * @throws {Error} When the listing request does not return a 2xx status.
 */
async function init() {
  const baseUrl = 'https://www.hardx.com';
  const listingUrl = `${baseUrl}/en/videos`;
  const res = await bhttp.get(listingUrl, {});

  // bhttp resolves for any HTTP status, so an error page would otherwise be
  // scraped as if it were a listing and silently produce no scenes.
  if (res.statusCode < 200 || res.statusCode >= 300) {
    throw new Error(`Unexpected HTTP status ${res.statusCode} for ${listingUrl}`);
  }

  const $ = cheerio.load(res.body.toString(), { normalizeWhitespace: true });
  const scenesElements = $('.sceneInfo').toArray();

  const scenes = scenesElements.map((element) => {
    const sceneElement = $(element);
    const sceneLinkElement = sceneElement.find('.sceneTitle a');

    // Without an href, concatenation would yield "<baseUrl>undefined".
    const href = sceneLinkElement.attr('href');
    const url = href ? `${baseUrl}${href}` : null;
    const title = sceneLinkElement.attr('title');

    // The first two `.value` elements are read as likes and dislikes, in that order.
    const [likes, dislikes] = sceneElement.find('.value')
      .toArray()
      .map((value) => Number($(value).text()));
    const stars = computeStars(likes, dislikes);

    const actors = sceneElement.find('.sceneActors a')
      .map((actorIndex, actorElement) => $(actorElement).attr('title'))
      .toArray();

    // date-fns v1 `parse` only understands ISO 8601 input (its second argument
    // is an options object, not a format string), so normalise the text first.
    const isoDate = toIsoDate(sceneElement.find('.sceneDate').text());
    const date = isoDate === null ? null : parse(isoDate);

    return {
      url,
      title,
      actors,
      date,
      rating: {
        likes,
        dislikes,
        stars,
      },
    };
  });

  console.log(scenes);
}
// Entry point: run the scraper and report any failure instead of leaving the
// returned promise's rejection unhandled; a non-zero exit code signals the error.
init().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});