const fs = require('fs/promises');
const path = require('path');

const jsdom = require('jsdom');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));

const ConcurrentQueue = require('./concurrent-queue.js');
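// Assumption: ConcurrentQueue (a local module, not shown here) exposes push(asyncTask)
// to enqueue work and waitForDrain() to resolve once every queued task has settled.
// The code below relies only on those two methods.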

class ScrapeError extends Error {
    constructor(message, options, fileName, lineNumber) {
        super(...arguments);
        this.name = 'ScrapeError';
    }
}

// From Cordis util.js
function sanitizeFileName(name) {
    // Windows version (created for Windows; most likely works cross-platform too)
    // Allowed characters: extended charset (char codes 1-254 pass through)
    // Illegal file names: CON, PRN, AUX, NUL, COM1, COM2, ..., COM9, LPT1, LPT2, ..., LPT9
    // Reserved characters: <>:"/\|?*
    // Solution: strip reserved characters, replace out-of-range characters with '_', and append '_' to illegal names

    // Illegal file names (Windows)
    if ([ 'CON', 'PRN', 'AUX', 'NUL',
          'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
          'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' ].indexOf(name) != -1) { // TODO: case insensitive?
        name += '_';
    }
    // Reserved characters
    name = name.replace(/[<>:\"\/\\|?*]/g, '');
    // Allowed characters
    return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join('');

    // Much stricter whitelist version: replace bad characters with '_'
    //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? c : '_').join('');
}
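
// For illustration (hypothetical inputs): sanitizeFileName('CON') returns 'CON_'
// (illegal name suffixed), and sanitizeFileName('a<b>:c') returns 'abc'
// (reserved characters stripped).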

function estMSRemaining(startTime, ratioComplete) {
    return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete);
}

function formatRelative(msRelative) {
    if (msRelative < 1000) return `${(msRelative).toFixed(2)}ms`;
    else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)}s`;
    else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`;
    else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`;
}
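
// Worked example: a job started 60s ago that is 25% complete yields
// estMSRemaining = (1 - 0.25) * (60000 / 0.25) = 180000 ms,
// which formatRelative renders as '3.00 mins'.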

async function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}

async function fuzzyDelay() {
    await sleep(500 + (500 * Math.random()));
}

async function saveJsonData(filename, dataJSON) {
    await fs.writeFile(filename, JSON.stringify(dataJSON));
}

// Note: This is the key scraper function. It scrapes the .js-store element's data
async function scrapeUGDataContent(url) {
    let page = null;
    try {
        page = await fetch(url);
    } catch (e) {
        throw new ScrapeError('Unable to fetch url', { cause: e });
    }

    let text = null;
    try {
        text = await page.text();
    } catch (e) {
        throw new ScrapeError('Unable to decode page', { cause: e });
    }

    let dom = null;
    try {
        dom = new jsdom.JSDOM(text);
    } catch (e) {
        throw new ScrapeError('Unable to parse document', { cause: e });
    }

    if (!dom.window || !dom.window.document) {
        throw new ScrapeError('Unable to parse document');
    }

    const document = dom.window.document;

    const jsStore = document.querySelector('.js-store');
    if (jsStore == null) {
        throw new ScrapeError('Unable to find .js-store element for ' + url);
    }

    const contentJSON = jsStore.getAttribute('data-content');
    if (contentJSON == null) {
        throw new ScrapeError('Unable to find data-content attribute on .js-store');
    }

    // Guard the parse so malformed JSON surfaces as a ScrapeError like the other failure modes
    try {
        return JSON.parse(contentJSON);
    } catch (e) {
        throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
    }
}
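
// The returned object is whatever the page embedded in .js-store's data-content
// attribute; the parse* helpers below pick out the pieces each page type needs.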

function parseTab(ugDataContent) {
    // Note: '== null' (rather than '=== null') also catches undefined, since
    // missing JSON properties read back as undefined rather than null.
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const meta = data.tab; if (meta == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab');
    const tview = data.tab_view; if (tview == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view');
    const wktab = tview.wiki_tab; if (wktab == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab');
    const text = wktab.content; if (text == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content');
    return { meta, text };
}

function parseBandsPage(ugDataContent) {
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const alpha = data.alpha; if (alpha == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha');
    const artists = data.artists; if (artists == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists');
    const pagenum = data.current_page; if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page');
    const pagecnt = data.page_count; if (pagecnt == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count');
    return { alpha, artists, pagenum, pagecnt };
}

function parseArtistPage(ugDataContent) {
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const pagination = data.pagination; if (pagination == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination');
    const pagenum = pagination.current; if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current');
    const pages = pagination.pages; if (pages == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages');

    const albumTabs = data.album_tabs; if (albumTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs');
    const chordProTabs = data.chord_pro_tabs; if (chordProTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs');
    const featTabs = data.feat_tabs; if (featTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs');
    const otherTabs = data.other_tabs; if (otherTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs');

    return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages };
}
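
// Inferred from scrapeAllArtistTabListPages below: each entry of pagination.pages
// is expected to carry at least { page, url } fields, where url may be relative
// to the current page.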

// Returns a list of tab metadata (including tab URL)
async function scrapeAllArtistTabListPages(startURL) {
    let tabs = [];
    let url = new URL(startURL); // Note: not considering the <base> tag; the implementation would have to change if this gets used elsewhere.
    while (true) {
        //console.log('scraping artist page: ' + url.toString());
        const ugDataContent = await scrapeUGDataContent(url.toString());
        const page = parseArtistPage(ugDataContent);

        tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs);

        const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1);
        if (nextPageData == null) break;
        url = new URL(nextPageData.url, url);

        await fuzzyDelay();
    }

    // ug.com returns the same tab (same id) on multiple pages, so filter out the duplicates.
    const uniqueTabIds = new Set();
    const uniqueTabs = [];
    for (let tab of tabs) {
        if (uniqueTabIds.has(tab.id)) continue;
        uniqueTabIds.add(tab.id);
        uniqueTabs.push(tab);
    }

    return uniqueTabs;
}

// Returns a list of artist metadata (including artist tab list URL)
async function scrapeAllBandListPages(startURL) {
    let artists = [];

    // https://www.ultimate-guitar.com/bands/d.htm
    let url = new URL(startURL);
    let startTime = Date.now();

    while (true) {
        const ugDataContent = await scrapeUGDataContent(url.toString());
        const page = parseBandsPage(ugDataContent);

        artists = artists.concat(page.artists);

        let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt));
        console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`);

        if (page.pagenum + 1 > page.pagecnt) break;
        url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm'); // d.htm (start) -> d2.htm -> d3.htm -> ...

        await fuzzyDelay();
    }

    return artists;
}

async function saveBandList(filename, url) {
    let artists = await scrapeAllBandListPages(url);
    await saveJsonData(filename, artists);
}

async function saveBandLists(urls) {
    let startTime = Date.now();
    let completed = 0;
    for (const url of urls) {
        console.log('doing band list: ' + url);
        await saveBandList(path.join('output', 'artists', sanitizeFileName(url + '.json')), url);

        completed += 1;
        let fromNow = formatRelative(estMSRemaining(startTime, completed / urls.length));
        console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(completed / urls.length * 100).toFixed(2)}%, ${fromNow} remaining)`);

        await fuzzyDelay();
    }
}

// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list
// of the artist's tab metadata (tab text is fetched in a different step)
async function saveArtistsWithTabMetadata(filename, artists) {
    const baseURL = 'https://www.ultimate-guitar.com/';

    let startTime = Date.now();
    let completed = 0;
    let taskQueue = new ConcurrentQueue(8); // Run a maximum of 8 artist tab list scrapers at a time
    // Note: the concurrent queue will (almost certainly) cause the artists to be somewhat to completely out of order in the output
    for (let artist of artists) {
        taskQueue.push(async () => {
            let artistStartURL = new URL(artist.artist_url, baseURL);
            let artistTabs = await scrapeAllArtistTabListPages(artistStartURL);
            artist.tabs = artistTabs;

            completed += 1;
            let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length));
            let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
            let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
            console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length * 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`);
        });
    }

    await taskQueue.waitForDrain();

    await saveJsonData(filename, artists);
}

module.exports = {
    scrapeBands: saveBandLists,
    scrapeArtistTabUrls: saveArtistsWithTabMetadata
};
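
// A minimal usage sketch (the module file name is hypothetical; the start URL is
// the one referenced in scrapeAllBandListPages above, and the output/artists
// directory is assumed to exist):
//
//   const scraper = require('./scrape.js');
//
//   (async () => {
//       // Saves one JSON artist list per start URL under output/artists/
//       await scraper.scrapeBands(['https://www.ultimate-guitar.com/bands/d.htm']);
//   })();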