const fs = require('fs/promises'); const path = require('path'); const jsdom = require('jsdom'); const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args)); const ConcurrentQueue = require('./concurrent-queue.js'); class ScrapeError extends Error { constructor(message, options, fileName, lineNumber) { super(...arguments); this.name = 'ScrapeError'; } } // From Cordis util.js function sanitizeFileName(name) { // Windows Version (created for Windows, most likely works cross-platform too given my research) // Allowed Characters: Extended Unicode Charset (1-255) // Illegal file names: CON, PRN, AUX, NUL, COM1, COM2, ..., COM9, LPT1, LPT2, ..., LPT9 // Reserved Characters: <>:"/\|?* // Solution: Replace reserved characters with empty string (''), bad characters with '_', and append '_' to bad names // Illegal File Names (Windows) if ([ 'CON', 'PRN', 'AUX', 'NUL', 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' ].indexOf(name) != -1) { // TODO: case insensitive? name += '_'; } // Reserved Characters name = name.replace(/[<>:\"\/\\|?*]/g, ''); // Allowed Characters return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join(''); // Much stricter whitelist version // replace bad characters with '_' //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? c : '_').join(''); } function estMSRemaining(startTime, ratioComplete) { return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete); } function formatRelative(msRelative) { if (msRelative < 1000) return `${(msRelative).toFixed(2)}ms`; else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)}s`; else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`; else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`; } async function sleep(ms) { return new Promise((resolve) => { setTimeout(resolve, ms); }); } async function fuzzyDelay() { await sleep(500 + (500 * Math.random())); } async function saveJsonData(filename, dataJSON) { await fs.writeFile(filename, JSON.stringify(dataJSON)); } // Note: This is the key scraper function. It scrapes the .js-store's data async function scrapeUGDataContent(url) { let page = null; try { page = await fetch(url); } catch (e) { throw new ScrapeError('Unable to fetch url', { cause: e }); } let text = null; try { text = await page.text(); } catch (e) { throw new ScrapeError('Unable to decode page', { cause: e }); } let dom = null; try { dom = new jsdom.JSDOM(text); } catch (e) { throw new ScrapeError('Unable to parse document', { cause: e }); } if (!dom.window || !dom.window.document) { throw new ScrapeError('Unable to parse document'); } let document = dom.window.document; const jsStore = document.querySelector('.js-store'); if (jsStore == null) { throw new ScrapeError('Unable to find .js-store element for ' + url); } const contentJSON = jsStore.getAttribute('data-content'); if (contentJSON == null) { throw new ScrapeError('Unable to find data-content attribute on .js-store'); } const content = JSON.parse(contentJSON); return content; } function parseTab(ugDataContent) { const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); const meta = data.tab; if (meta === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab'); const tview = data.tab_view; if (tview === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view'); const wktab = tview.wiki_tab; if (wktab === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab'); const text = wktab.content; if (text === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content'); return { meta, text }; } function parseBandsPage(ugDataContent) { const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); const alpha = data.alpha; if (alpha === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha'); const artists = data.artists; if (artists === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists'); const pagenum = data.current_page; if (pagenum === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page'); const pagecnt = data.page_count; if (pagecnt === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count'); return { alpha, artists, pagenum, pagecnt }; } function parseArtistPage(ugDataContent) { const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); const pagination = data.pagination; if (pagination === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination'); const pagenum = pagination.current; if (pagenum === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current'); const pages = pagination.pages; if (pages === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages'); const albumTabs = data.album_tabs; if (albumTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs'); const chordProTabs = data.chord_pro_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs'); const featTabs = data.feat_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs'); const otherTabs = data.other_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs'); return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages }; } // Returns a list of tab metadata (including tab URL) async function scrapeAllArtistTabListPages(startURL) { let tabs = []; let url = new URL(startURL); // Note: not considering the tag, would have to change the implementation if this gets used somewhere. while (true) { //console.log('scraping artist page: ' + url.toString()); const ugDataContent = await scrapeUGDataContent(url.toString()); const page = parseArtistPage(ugDataContent); tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs); const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1); if (nextPageData == null) break; url = new URL(nextPageData.url, url); await fuzzyDelay(); } // the autists at ug.com thought it would be a good idea to return the same tab (same id) on different pages. This filters out duplicates const uniqueTabIds = new Set(); const uniqueTabs = []; for (let tab of tabs) { if (uniqueTabIds.has(tab.id)) continue; uniqueTabIds.add(tab.id); uniqueTabs.push(tab); } return uniqueTabs; } // Returns a list of artist metadata (including artist tab list URL) async function scrapeAllBandListPages(startURL) { let artists = []; // https://www.ultimate-guitar.com/bands/d.htm let url = new URL(startURL); let startTime = Date.now(); while (true) { const ugDataContent = await scrapeUGDataContent(url.toString()); const page = parseBandsPage(ugDataContent); artists = artists.concat(page.artists); let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt)); console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`); if (page.pagenum + 1 > page.pagecnt) break; url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm'); // d.htm (start) -> d2.htm -> d3.htm -> ... await fuzzyDelay(); } return artists; } async function saveBandList(filename, url) { let artists = await scrapeAllBandListPages(url); await saveJsonData(filename, artists); } // nigger :) async function saveBandLists(urls) { let startTime = Date.now(); let completed = 0; for (const url of urls) { console.log('doing band list: ' + url); await saveBandList(path.join('output', 'artists', sanitizeFileName(url + '.json')), url); completed += 1; let fromNow = formatRelative(estMSRemaining(startTime, completed / urls.length)); console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(completed / urls.length * 100).toFixed(2)}%, ${fromNow} remaining)`); await fuzzyDelay(); } } // Note: modifies artists to add a 'tabs' property to each artist. This property contains a list // of the artist's tab metadatas (tab text is done in a different step) async function saveArtistsWithTabMetadata(filename, artists) { const baseURL = 'https://www.ultimate-guitar.com/'; let startTime = Date.now(); let completed = 0; let taskQueue = new ConcurrentQueue(8); // Run a maximum of 4 artist tab list scrapers at a time // Note: the concurrent queue will (almost certainly) cause the artists to be somewhat to completely out of order in the output for (let artist of artists) { taskQueue.push(async () => { let artistStartURL = new URL(artist.artist_url, baseURL); let artistTabs = await scrapeAllArtistTabListPages(artistStartURL); artist.tabs = artistTabs; completed += 1; let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length)); let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2); let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2); console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length * 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`); }); } await taskQueue.waitForDrain(); await saveJsonData(filename, artists); } module.exports = { scrapeBands: saveBandLists, scrapeArtistTabUrls: saveArtistsWithTabMetadata };