const fs = require('fs/promises');
const path = require('path');

const jsdom = require('jsdom');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));

const ConcurrentQueue = require('./concurrent-queue.js');
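// Assumption: ConcurrentQueue (a local module, not shown here) exposes push(asyncTask)
// to enqueue work and waitForDrain() to resolve once every queued task has settled.
// The code below relies only on those two methods.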

class ScrapeError extends Error {
    constructor(message, options, fileName, lineNumber) {
        super(...arguments);
        this.name = 'ScrapeError';
    }
}

// From Cordis util.js
function sanitizeFileName(name) {
    // Windows version (created for Windows; most likely works cross-platform too)
    // Allowed characters: extended charset (char codes 1-254 pass through)
    // Illegal file names: CON, PRN, AUX, NUL, COM1, COM2, ..., COM9, LPT1, LPT2, ..., LPT9
    // Reserved characters: <>:"/\|?*
    // Solution: strip reserved characters, replace out-of-range characters with '_', and append '_' to illegal names

    // Illegal file names (Windows)
    if ([ 'CON', 'PRN', 'AUX', 'NUL',
          'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
          'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' ].indexOf(name) != -1) { // TODO: case insensitive?
        name += '_';
    }
    // Reserved characters
    name = name.replace(/[<>:\"\/\\|?*]/g, '');
    // Allowed characters
    return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join('');

    // Much stricter whitelist version: replace bad characters with '_'
    //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? c : '_').join('');
}
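
// For illustration (hypothetical inputs): sanitizeFileName('CON') returns 'CON_'
// (illegal name suffixed), and sanitizeFileName('a<b>:c') returns 'abc'
// (reserved characters stripped).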

function estMSRemaining(startTime, ratioComplete) {
    return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete);
}

function formatRelative(msRelative) {
    if (msRelative < 1000) return `${(msRelative).toFixed(2)}ms`;
    else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)}s`;
    else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`;
    else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`;
}
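
// Worked example: a job started 60s ago that is 25% complete yields
// estMSRemaining = (1 - 0.25) * (60000 / 0.25) = 180000 ms,
// which formatRelative renders as '3.00 mins'.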

async function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}

async function fuzzyDelay() {
    await sleep(500 + (500 * Math.random()));
}

async function saveJsonData(filename, dataJSON) {
    await fs.writeFile(filename, JSON.stringify(dataJSON));
}

// Note: This is the key scraper function. It scrapes the .js-store element's data
async function scrapeUGDataContent(url) {
    let page = null;
    try {
        page = await fetch(url);
    } catch (e) {
        throw new ScrapeError('Unable to fetch url', { cause: e });
    }

    let text = null;
    try {
        text = await page.text();
    } catch (e) {
        throw new ScrapeError('Unable to decode page', { cause: e });
    }

    let dom = null;
    try {
        dom = new jsdom.JSDOM(text);
    } catch (e) {
        throw new ScrapeError('Unable to parse document', { cause: e });
    }

    if (!dom.window || !dom.window.document) {
        throw new ScrapeError('Unable to parse document');
    }

    const document = dom.window.document;

    const jsStore = document.querySelector('.js-store');
    if (jsStore == null) {
        throw new ScrapeError('Unable to find .js-store element for ' + url);
    }

    const contentJSON = jsStore.getAttribute('data-content');
    if (contentJSON == null) {
        throw new ScrapeError('Unable to find data-content attribute on .js-store');
    }

    // Guard the parse so malformed JSON surfaces as a ScrapeError like the other failure modes
    try {
        return JSON.parse(contentJSON);
    } catch (e) {
        throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
    }
}
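
// The returned object is whatever the page embedded in .js-store's data-content
// attribute; the parse* helpers below pick out the pieces each page type needs.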

function parseTab(ugDataContent) {
    // Note: '== null' (rather than '=== null') also catches undefined, since
    // missing JSON properties read back as undefined rather than null.
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const meta = data.tab; if (meta == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab');
    const tview = data.tab_view; if (tview == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view');
    const wktab = tview.wiki_tab; if (wktab == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab');
    const text = wktab.content; if (text == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content');
    return { meta, text };
}

function parseBandsPage(ugDataContent) {
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const alpha = data.alpha; if (alpha == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha');
    const artists = data.artists; if (artists == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists');
    const pagenum = data.current_page; if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page');
    const pagecnt = data.page_count; if (pagecnt == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count');
    return { alpha, artists, pagenum, pagecnt };
}

function parseArtistPage(ugDataContent) {
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const pagination = data.pagination; if (pagination == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination');
    const pagenum = pagination.current; if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current');
    const pages = pagination.pages; if (pages == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages');

    const albumTabs = data.album_tabs; if (albumTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs');
    const chordProTabs = data.chord_pro_tabs; if (chordProTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs');
    const featTabs = data.feat_tabs; if (featTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs');
    const otherTabs = data.other_tabs; if (otherTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs');

    return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages };
}
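
// Inferred from scrapeAllArtistTabListPages below: each entry of pagination.pages
// is expected to carry at least { page, url } fields, where url may be relative
// to the current page.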

// Returns a list of tab metadata (including tab URL)
async function scrapeAllArtistTabListPages(startURL) {
    let tabs = [];
    let url = new URL(startURL); // Note: not considering the <base> tag; the implementation would have to change if this gets used elsewhere.
    while (true) {
        //console.log('scraping artist page: ' + url.toString());
        const ugDataContent = await scrapeUGDataContent(url.toString());
        const page = parseArtistPage(ugDataContent);

        tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs);

        const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1);
        if (nextPageData == null) break;
        url = new URL(nextPageData.url, url);

        await fuzzyDelay();
    }

    // ug.com returns the same tab (same id) on multiple pages, so filter out the duplicates.
    const uniqueTabIds = new Set();
    const uniqueTabs = [];
    for (let tab of tabs) {
        if (uniqueTabIds.has(tab.id)) continue;
        uniqueTabIds.add(tab.id);
        uniqueTabs.push(tab);
    }

    return uniqueTabs;
}

// Returns a list of artist metadata (including artist tab list URL)
async function scrapeAllBandListPages(startURL) {
    let artists = [];

    // https://www.ultimate-guitar.com/bands/d.htm
    let url = new URL(startURL);
    let startTime = Date.now();

    while (true) {
        const ugDataContent = await scrapeUGDataContent(url.toString());
        const page = parseBandsPage(ugDataContent);

        artists = artists.concat(page.artists);

        let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt));
        console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`);

        if (page.pagenum + 1 > page.pagecnt) break;
        url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm'); // d.htm (start) -> d2.htm -> d3.htm -> ...

        await fuzzyDelay();
    }

    return artists;
}

async function saveBandList(filename, url) {
    let artists = await scrapeAllBandListPages(url);
    await saveJsonData(filename, artists);
}

async function saveBandLists(urls) {
    let startTime = Date.now();
    let completed = 0;
    for (const url of urls) {
        console.log('doing band list: ' + url);
        await saveBandList(path.join('output', 'artists', sanitizeFileName(url + '.json')), url);

        completed += 1;
        let fromNow = formatRelative(estMSRemaining(startTime, completed / urls.length));
        console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(completed / urls.length * 100).toFixed(2)}%, ${fromNow} remaining)`);

        await fuzzyDelay();
    }
}

// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list
// of the artist's tab metadata (tab text is fetched in a different step)
async function saveArtistsWithTabMetadata(filename, artists) {
    const baseURL = 'https://www.ultimate-guitar.com/';

    let startTime = Date.now();
    let completed = 0;
    let taskQueue = new ConcurrentQueue(8); // Run a maximum of 8 artist tab list scrapers at a time
    // Note: the concurrent queue will (almost certainly) cause the artists to be somewhat to completely out of order in the output
    for (let artist of artists) {
        taskQueue.push(async () => {
            let artistStartURL = new URL(artist.artist_url, baseURL);
            let artistTabs = await scrapeAllArtistTabListPages(artistStartURL);
            artist.tabs = artistTabs;

            completed += 1;
            let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length));
            let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
            let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
            console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length * 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`);
        });
    }

    await taskQueue.waitForDrain();

    await saveJsonData(filename, artists);
}

module.exports = {
    scrapeBands: saveBandLists,
    scrapeArtistTabUrls: saveArtistsWithTabMetadata
};
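
// A minimal usage sketch (the module file name is hypothetical; the start URL is
// the one referenced in scrapeAllBandListPages above, and the output/artists
// directory is assumed to exist):
//
//   const scraper = require('./scrape.js');
//
//   (async () => {
//       // Saves one JSON artist list per start URL under output/artists/
//       await scraper.scrapeBands(['https://www.ultimate-guitar.com/bands/d.htm']);
//   })();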