From 87a424494bfbebd8024792bed62dfe1af5dedfaa Mon Sep 17 00:00:00 2001 From: Anonymous <> Date: Mon, 1 Aug 2022 19:03:29 -0500 Subject: [PATCH] these changes were sitting for a while --- .gitignore | 18 +- 01-scraper-urls/01-scrape-bands.js | 72 +-- 01-scraper-urls/02-scrape-artist-tab-urls.js | 86 +-- 01-scraper-urls/concurrent-queue.js | 96 ++-- 01-scraper-urls/url-scraper.js | 510 ++++++++--------- 02-ingest-sqlite/01-injest-sqlite.js | 252 ++++----- 04-scraper-tabs/01-scrape-tabs.js | 482 ++++++++-------- 04-scraper-tabs/concurrent-queue.js | 96 ++-- 06-output-generator/01-output-generator.js | 261 ++++----- README.md | 548 +++++++++---------- 10 files changed, 1215 insertions(+), 1206 deletions(-) diff --git a/.gitignore b/.gitignore index 5187cd7..2b3e854 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,9 @@ -**/node_modules -**/input/**/*.db -**/input/**/*.db-journal -**/input/**/*.json -**/output/**/*.db -**/output/**/*.db-journal -**/output/**/*.json -**/output/**/*.txt -**/*.7z +**/node_modules +**/input/**/*.db +**/input/**/*.db-journal +**/input/**/*.json +**/output/**/*.db +**/output/**/*.db-journal +**/output/**/*.json +**/output/**/*.txt +**/*.7z diff --git a/01-scraper-urls/01-scrape-bands.js b/01-scraper-urls/01-scrape-bands.js index 966abfd..98663b1 100644 --- a/01-scraper-urls/01-scrape-bands.js +++ b/01-scraper-urls/01-scrape-bands.js @@ -1,36 +1,36 @@ -const Scraper = require('./url-scraper.js'); - -// Comment out the urls that you have already scraped -const bandListUrls = [ - 'https://www.ultimate-guitar.com/bands/0-9.htm', - // 'https://www.ultimate-guitar.com/bands/a.htm', - // 'https://www.ultimate-guitar.com/bands/b.htm', - // 'https://www.ultimate-guitar.com/bands/c.htm', - // 'https://www.ultimate-guitar.com/bands/d.htm', - // 'https://www.ultimate-guitar.com/bands/e.htm', - // 'https://www.ultimate-guitar.com/bands/f.htm', - // 'https://www.ultimate-guitar.com/bands/g.htm', - // 'https://www.ultimate-guitar.com/bands/h.htm', - // 
'https://www.ultimate-guitar.com/bands/i.htm', - // 'https://www.ultimate-guitar.com/bands/j.htm', - // 'https://www.ultimate-guitar.com/bands/k.htm', - // 'https://www.ultimate-guitar.com/bands/l.htm', - // 'https://www.ultimate-guitar.com/bands/m.htm', - // 'https://www.ultimate-guitar.com/bands/n.htm', - // 'https://www.ultimate-guitar.com/bands/o.htm', - // 'https://www.ultimate-guitar.com/bands/p.htm', - // 'https://www.ultimate-guitar.com/bands/q.htm', - // 'https://www.ultimate-guitar.com/bands/r.htm', - // 'https://www.ultimate-guitar.com/bands/s.htm', - // 'https://www.ultimate-guitar.com/bands/t.htm', - // 'https://www.ultimate-guitar.com/bands/u.htm', - // 'https://www.ultimate-guitar.com/bands/v.htm', - // 'https://www.ultimate-guitar.com/bands/w.htm', - // 'https://www.ultimate-guitar.com/bands/x.htm', - // 'https://www.ultimate-guitar.com/bands/y.htm', - // 'https://www.ultimate-guitar.com/bands/z.htm', -]; - -(async () => { - await Scraper.scrapeBands(bandListUrls); -})(); +const Scraper = require('./url-scraper.js'); + +// Comment out the urls that you have already scraped +const bandListUrls = [ + 'https://www.ultimate-guitar.com/bands/0-9.htm', + // 'https://www.ultimate-guitar.com/bands/a.htm', + // 'https://www.ultimate-guitar.com/bands/b.htm', + // 'https://www.ultimate-guitar.com/bands/c.htm', + // 'https://www.ultimate-guitar.com/bands/d.htm', + // 'https://www.ultimate-guitar.com/bands/e.htm', + // 'https://www.ultimate-guitar.com/bands/f.htm', + // 'https://www.ultimate-guitar.com/bands/g.htm', + // 'https://www.ultimate-guitar.com/bands/h.htm', + // 'https://www.ultimate-guitar.com/bands/i.htm', + // 'https://www.ultimate-guitar.com/bands/j.htm', + // 'https://www.ultimate-guitar.com/bands/k.htm', + // 'https://www.ultimate-guitar.com/bands/l.htm', + // 'https://www.ultimate-guitar.com/bands/m.htm', + // 'https://www.ultimate-guitar.com/bands/n.htm', + // 'https://www.ultimate-guitar.com/bands/o.htm', + // 
'https://www.ultimate-guitar.com/bands/p.htm', + // 'https://www.ultimate-guitar.com/bands/q.htm', + // 'https://www.ultimate-guitar.com/bands/r.htm', + // 'https://www.ultimate-guitar.com/bands/s.htm', + // 'https://www.ultimate-guitar.com/bands/t.htm', + // 'https://www.ultimate-guitar.com/bands/u.htm', + // 'https://www.ultimate-guitar.com/bands/v.htm', + // 'https://www.ultimate-guitar.com/bands/w.htm', + // 'https://www.ultimate-guitar.com/bands/x.htm', + // 'https://www.ultimate-guitar.com/bands/y.htm', + // 'https://www.ultimate-guitar.com/bands/z.htm', +]; + +(async () => { + await Scraper.scrapeBands(bandListUrls); +})(); diff --git a/01-scraper-urls/02-scrape-artist-tab-urls.js b/01-scraper-urls/02-scrape-artist-tab-urls.js index 97bfbe0..a0877a5 100644 --- a/01-scraper-urls/02-scrape-artist-tab-urls.js +++ b/01-scraper-urls/02-scrape-artist-tab-urls.js @@ -1,43 +1,43 @@ -const fs = require('fs/promises'); -const path = require('path'); - -const Scraper = require('./url-scraper.js'); - -// Comment out the artist files that you have already scraped -const artistListFiles = [ - 'output/artists/httpswww.ultimate-guitar.combands0-9.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsa.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsb.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsc.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsd.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandse.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsf.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsg.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsh.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsi.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsj.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsk.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsl.htm.json', - // 
'output/artists/httpswww.ultimate-guitar.combandsm.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsn.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandso.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsp.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsq.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsr.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandss.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandst.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsu.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsv.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsw.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsx.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsy.htm.json', - // 'output/artists/httpswww.ultimate-guitar.combandsz.htm.json', -]; - -(async () => { - let num = 0; - for (let file of artistListFiles) { - let artists = JSON.parse(await fs.readFile(file)); - await Scraper.scrapeArtistTabUrls(path.join('output', 'artists-with-tabs', 'artists-part-' + num + '.json'), artists); - } -})(); +const fs = require('fs/promises'); +const path = require('path'); + +const Scraper = require('./url-scraper.js'); + +// Comment out the artist files that you have already scraped +const artistListFiles = [ + 'output/artists/httpswww.ultimate-guitar.combands0-9.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsa.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsb.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsc.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsd.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandse.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsf.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsg.htm.json', + // 
'output/artists/httpswww.ultimate-guitar.combandsh.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsi.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsj.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsk.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsl.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsm.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsn.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandso.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsp.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsq.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsr.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandss.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandst.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsu.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsv.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsw.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsx.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsy.htm.json', + // 'output/artists/httpswww.ultimate-guitar.combandsz.htm.json', +]; + +(async () => { + let num = 0; + for (let file of artistListFiles) { + let artists = JSON.parse(await fs.readFile(file)); + await Scraper.scrapeArtistTabUrls(path.join('output', 'artists-with-tabs', 'artists-part-' + num + '.json'), artists); + } +})(); diff --git a/01-scraper-urls/concurrent-queue.js b/01-scraper-urls/concurrent-queue.js index c1c3abd..09cb51b 100644 --- a/01-scraper-urls/concurrent-queue.js +++ b/01-scraper-urls/concurrent-queue.js @@ -1,48 +1,48 @@ -// Runs a limited number of promises at one time -class ConcurrentQueue { - constructor(consecutive) { - this.consecutive = consecutive; - this.queue = []; - this.current = 0; - this.drainListeners = []; - } - - _checkQueue() { - if (this.current == 0 
&& this.queue.length == 0) { - for (let drainListener of this.drainListeners) { - drainListener(); - } - this.drainListeners = []; - } - while (this.current < this.consecutive && this.queue.length > 0) { - let taskData = this.queue.shift(); - this.current += 1; - (async () => { - try { - taskData.resolve(await taskData.task()); - } catch (e) { - taskData.reject(e); - } - this.current -= 1; - this._checkQueue(); - })(); - } - } - - // returns a promise that can be awaited to get the resolution or rejection of the task's execution - push(task) { - return new Promise((resolve, reject) => { - this.queue.push({ task, resolve, reject }) - this._checkQueue(); - }); - } - - async waitForDrain() { - return new Promise((resolve) => { - this.drainListeners.push(resolve); - this._checkQueue(); - }); - } -} - -module.exports = ConcurrentQueue; +// Runs a limited number of promises at one time +class ConcurrentQueue { + constructor(consecutive) { + this.consecutive = consecutive; + this.queue = []; + this.current = 0; + this.drainListeners = []; + } + + _checkQueue() { + if (this.current == 0 && this.queue.length == 0) { + for (let drainListener of this.drainListeners) { + drainListener(); + } + this.drainListeners = []; + } + while (this.current < this.consecutive && this.queue.length > 0) { + let taskData = this.queue.shift(); + this.current += 1; + (async () => { + try { + taskData.resolve(await taskData.task()); + } catch (e) { + taskData.reject(e); + } + this.current -= 1; + this._checkQueue(); + })(); + } + } + + // returns a promise that can be awaited to get the resolution or rejection of the task's execution + push(task) { + return new Promise((resolve, reject) => { + this.queue.push({ task, resolve, reject }) + this._checkQueue(); + }); + } + + async waitForDrain() { + return new Promise((resolve) => { + this.drainListeners.push(resolve); + this._checkQueue(); + }); + } +} + +module.exports = ConcurrentQueue; diff --git a/01-scraper-urls/url-scraper.js 
b/01-scraper-urls/url-scraper.js index c5fd3eb..4f339d3 100644 --- a/01-scraper-urls/url-scraper.js +++ b/01-scraper-urls/url-scraper.js @@ -1,255 +1,255 @@ -const fs = require('fs/promises'); -const path = require('path'); - -const jsdom = require('jsdom'); -const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args)); - -const ConcurrentQueue = require('./concurrent-queue.js'); - -class ScrapeError extends Error { - constructor(message, options, fileName, lineNumber) { - super(...arguments); - this.name = 'ScrapeError'; - } -} - -// From Cordis util.js -function sanitizeFileName(name) { - // Windows Version (created for Windows, most likely works cross-platform too given my research) - // Allowed Characters: Extended Unicode Charset (1-255) - // Illegal file names: CON, PRN, AUX, NUL, COM1, COM2, ..., COM9, LPT1, LPT2, ..., LPT9 - // Reserved Characters: <>:"/\|?* - // Solution: Replace reserved characters with empty string (''), bad characters with '_', and append '_' to bad names - - // Illegal File Names (Windows) - if ([ 'CON', 'PRN', 'AUX', 'NUL', - 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', - 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' ].indexOf(name) != -1) { // TODO: case insensitive? - name += '_'; - } - // Reserved Characters - name = name.replace(/[<>:\"\/\\|?*]/g, ''); - // Allowed Characters - return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join(''); - - // Much stricter whitelist version - // replace bad characters with '_' - //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? 
c : '_').join(''); -} - -function estMSRemaining(startTime, ratioComplete) { - return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete); -} - -function formatRelative(msRelative) { - if (msRelative < 1000) return `${(msRelative).toFixed(2)}ms`; - else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)}s`; - else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`; - else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`; -} - -async function sleep(ms) { - return new Promise((resolve) => { - setTimeout(resolve, ms); - }); -} - -async function fuzzyDelay() { - await sleep(500 + (500 * Math.random())); -} - -async function saveJsonData(filename, dataJSON) { - await fs.writeFile(filename, JSON.stringify(dataJSON)); -} - -// Note: This is the key scraper function. It scrapes the .js-store's data -async function scrapeUGDataContent(url) { - let page = null; - try { - page = await fetch(url); - } catch (e) { - throw new ScrapeError('Unable to fetch url', { cause: e }); - } - - let text = null; - try { - text = await page.text(); - } catch (e) { - throw new ScrapeError('Unable to decode page', { cause: e }); - } - - let dom = null; - try { - dom = new jsdom.JSDOM(text); - } catch (e) { - throw new ScrapeError('Unable to parse document', { cause: e }); - } - - if (!dom.window || !dom.window.document) { - throw new ScrapeError('Unable to parse document'); - } - - let document = dom.window.document; - - const jsStore = document.querySelector('.js-store'); - if (jsStore == null) { - throw new ScrapeError('Unable to find .js-store element for ' + url); - } - - const contentJSON = jsStore.getAttribute('data-content'); - if (contentJSON == null) { - throw new ScrapeError('Unable to find data-content attribute on .js-store'); - } - - const content = JSON.parse(contentJSON); - return content; -} - -function parseTab(ugDataContent) { - const store = ugDataContent.store; if (store === null) throw new 
ScrapeError('Unable to get ugDataContent.store'); - const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); - const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); - const meta = data.tab; if (meta === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab'); - const tview = data.tab_view; if (tview === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view'); - const wktab = tview.wiki_tab; if (wktab === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab'); - const text = wktab.content; if (text === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content'); - return { meta, text }; -} - -function parseBandsPage(ugDataContent) { - const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); - const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); - const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); - const alpha = data.alpha; if (alpha === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha'); - const artists = data.artists; if (artists === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists'); - const pagenum = data.current_page; if (pagenum === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page'); - const pagecnt = data.page_count; if (pagecnt === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count'); - return { alpha, artists, pagenum, pagecnt }; -} - -function parseArtistPage(ugDataContent) { - const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); - const page = store.page; 
if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); - const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); - const pagination = data.pagination; if (pagination === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination'); - const pagenum = pagination.current; if (pagenum === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current'); - const pages = pagination.pages; if (pages === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages'); - - const albumTabs = data.album_tabs; if (albumTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs'); - const chordProTabs = data.chord_pro_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs'); - const featTabs = data.feat_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs'); - const otherTabs = data.other_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs'); - - return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages }; -} - -// Returns a list of tab metadata (including tab URL) -async function scrapeAllArtistTabListPages(startURL) { - let tabs = []; - let url = new URL(startURL); // Note: not considering the tag, would have to change the implementation if this gets used somewhere. 
- while (true) { - //console.log('scraping artist page: ' + url.toString()); - const ugDataContent = await scrapeUGDataContent(url.toString()); - const page = parseArtistPage(ugDataContent); - - tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs); - - const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1); - if (nextPageData == null) break; - url = new URL(nextPageData.url, url); - - await fuzzyDelay(); - } - - // the autists at ug.com thought it would be a good idea to return the same tab (same id) on different pages. This filters out duplicates - const uniqueTabIds = new Set(); - const uniqueTabs = []; - for (let tab of tabs) { - if (uniqueTabIds.has(tab.id)) continue; - uniqueTabIds.add(tab.id); - uniqueTabs.push(tab); - } - - return uniqueTabs; -} - -// Returns a list of artist metadata (including artist tab list URL) -async function scrapeAllBandListPages(startURL) { - let artists = []; - - // https://www.ultimate-guitar.com/bands/d.htm - let url = new URL(startURL); - let startTime = Date.now(); - - while (true) { - const ugDataContent = await scrapeUGDataContent(url.toString()); - const page = parseBandsPage(ugDataContent); - - artists = artists.concat(page.artists); - - let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt)); - console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`); - - if (page.pagenum + 1 > page.pagecnt) break; - url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm'); // d.htm (start) -> d2.htm -> d3.htm -> ... 
- - await fuzzyDelay(); - } - - return artists; -} - -async function saveBandList(filename, url) { - let artists = await scrapeAllBandListPages(url); - await saveJsonData(filename, artists); -} - -// nigger :) - -async function saveBandLists(urls) { - let startTime = Date.now(); - let completed = 0; - for (const url of urls) { - console.log('doing band list: ' + url); - await saveBandList(path.join('output', 'artists', sanitizeFileName(url + '.json')), url); - - completed += 1; - let fromNow = formatRelative(estMSRemaining(startTime, completed / urls.length)); - console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(completed / urls.length * 100).toFixed(2)}%, ${fromNow} remaining)`); - - await fuzzyDelay(); - } -} - -// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list -// of the artist's tab metadatas (tab text is done in a different step) -async function saveArtistsWithTabMetadata(filename, artists) { - const baseURL = 'https://www.ultimate-guitar.com/'; - - let startTime = Date.now(); - let completed = 0; - let taskQueue = new ConcurrentQueue(8); // Run a maximum of 4 artist tab list scrapers at a time - // Note: the concurrent queue will (almost certainly) cause the artists to be somewhat to completely out of order in the output - for (let artist of artists) { - taskQueue.push(async () => { - let artistStartURL = new URL(artist.artist_url, baseURL); - let artistTabs = await scrapeAllArtistTabListPages(artistStartURL); - artist.tabs = artistTabs; - - completed += 1; - let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length)); - let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2); - let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2); - console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length 
* 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`); - }); - } - - await taskQueue.waitForDrain(); - - await saveJsonData(filename, artists); -} - -module.exports = { - scrapeBands: saveBandLists, - scrapeArtistTabUrls: saveArtistsWithTabMetadata -}; +const fs = require('fs/promises'); +const path = require('path'); + +const jsdom = require('jsdom'); +const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args)); + +const ConcurrentQueue = require('./concurrent-queue.js'); + +class ScrapeError extends Error { + constructor(message, options, fileName, lineNumber) { + super(...arguments); + this.name = 'ScrapeError'; + } +} + +// From Cordis util.js +function sanitizeFileName(name) { + // Windows Version (created for Windows, most likely works cross-platform too given my research) + // Allowed Characters: Extended Unicode Charset (1-255) + // Illegal file names: CON, PRN, AUX, NUL, COM1, COM2, ..., COM9, LPT1, LPT2, ..., LPT9 + // Reserved Characters: <>:"/\|?* + // Solution: Replace reserved characters with empty string (''), bad characters with '_', and append '_' to bad names + + // Illegal File Names (Windows) + if ([ 'CON', 'PRN', 'AUX', 'NUL', + 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', + 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' ].indexOf(name) != -1) { // TODO: case insensitive? + name += '_'; + } + // Reserved Characters + name = name.replace(/[<>:\"\/\\|?*]/g, ''); + // Allowed Characters + return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join(''); + + // Much stricter whitelist version + // replace bad characters with '_' + //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? 
c : '_').join(''); +} + +function estMSRemaining(startTime, ratioComplete) { + return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete); +} + +function formatRelative(msRelative) { + if (msRelative < 1000) return `${(msRelative).toFixed(2)}ms`; + else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)}s`; + else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`; + else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`; +} + +async function sleep(ms) { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +async function fuzzyDelay() { + await sleep(500 + (500 * Math.random())); +} + +async function saveJsonData(filename, dataJSON) { + await fs.writeFile(filename, JSON.stringify(dataJSON)); +} + +// Note: This is the key scraper function. It scrapes the .js-store's data +async function scrapeUGDataContent(url) { + let page = null; + try { + page = await fetch(url); + } catch (e) { + throw new ScrapeError('Unable to fetch url', { cause: e }); + } + + let text = null; + try { + text = await page.text(); + } catch (e) { + throw new ScrapeError('Unable to decode page', { cause: e }); + } + + let dom = null; + try { + dom = new jsdom.JSDOM(text); + } catch (e) { + throw new ScrapeError('Unable to parse document', { cause: e }); + } + + if (!dom.window || !dom.window.document) { + throw new ScrapeError('Unable to parse document'); + } + + let document = dom.window.document; + + const jsStore = document.querySelector('.js-store'); + if (jsStore == null) { + throw new ScrapeError('Unable to find .js-store element for ' + url); + } + + const contentJSON = jsStore.getAttribute('data-content'); + if (contentJSON == null) { + throw new ScrapeError('Unable to find data-content attribute on .js-store'); + } + + const content = JSON.parse(contentJSON); + return content; +} + +function parseTab(ugDataContent) { + const store = ugDataContent.store; if (store === null) throw new 
ScrapeError('Unable to get ugDataContent.store'); + const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); + const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); + const meta = data.tab; if (meta === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab'); + const tview = data.tab_view; if (tview === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view'); + const wktab = tview.wiki_tab; if (wktab === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab'); + const text = wktab.content; if (text === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content'); + return { meta, text }; +} + +function parseBandsPage(ugDataContent) { + const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); + const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); + const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); + const alpha = data.alpha; if (alpha === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha'); + const artists = data.artists; if (artists === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists'); + const pagenum = data.current_page; if (pagenum === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page'); + const pagecnt = data.page_count; if (pagecnt === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count'); + return { alpha, artists, pagenum, pagecnt }; +} + +function parseArtistPage(ugDataContent) { + const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); + const page = store.page; 
if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); + const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); + const pagination = data.pagination; if (pagination === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination'); + const pagenum = pagination.current; if (pagenum === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current'); + const pages = pagination.pages; if (pages === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages'); + + const albumTabs = data.album_tabs; if (albumTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs'); + const chordProTabs = data.chord_pro_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs'); + const featTabs = data.feat_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs'); + const otherTabs = data.other_tabs; if (chordProTabs === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs'); + + return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages }; +} + +// Returns a list of tab metadata (including tab URL) +async function scrapeAllArtistTabListPages(startURL) { + let tabs = []; + let url = new URL(startURL); // Note: not considering the tag, would have to change the implementation if this gets used somewhere. 
+ while (true) { + //console.log('scraping artist page: ' + url.toString()); + const ugDataContent = await scrapeUGDataContent(url.toString()); + const page = parseArtistPage(ugDataContent); + + tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs); + + const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1); + if (nextPageData == null) break; + url = new URL(nextPageData.url, url); + + await fuzzyDelay(); + } + + // the autists at ug.com thought it would be a good idea to return the same tab (same id) on different pages. This filters out duplicates + const uniqueTabIds = new Set(); + const uniqueTabs = []; + for (let tab of tabs) { + if (uniqueTabIds.has(tab.id)) continue; + uniqueTabIds.add(tab.id); + uniqueTabs.push(tab); + } + + return uniqueTabs; +} + +// Returns a list of artist metadata (including artist tab list URL) +async function scrapeAllBandListPages(startURL) { + let artists = []; + + // https://www.ultimate-guitar.com/bands/d.htm + let url = new URL(startURL); + let startTime = Date.now(); + + while (true) { + const ugDataContent = await scrapeUGDataContent(url.toString()); + const page = parseBandsPage(ugDataContent); + + artists = artists.concat(page.artists); + + let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt)); + console.log(`Band List Status: ${page.pagenum} / ${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`); + + if (page.pagenum + 1 > page.pagecnt) break; + url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm'); // d.htm (start) -> d2.htm -> d3.htm -> ... 
+ + await fuzzyDelay(); + } + + return artists; +} + +async function saveBandList(filename, url) { + let artists = await scrapeAllBandListPages(url); + await saveJsonData(filename, artists); +} + +// nigger :) + +async function saveBandLists(urls) { + let startTime = Date.now(); + let completed = 0; + for (const url of urls) { + console.log('doing band list: ' + url); + await saveBandList(path.join('output', 'artists', sanitizeFileName(url + '.json')), url); + + completed += 1; + let fromNow = formatRelative(estMSRemaining(startTime, completed / urls.length)); + console.log(`Save All Band List Status: ${completed} / ${urls.length} band lists complete (${(completed / urls.length * 100).toFixed(2)}%, ${fromNow} remaining)`); + + await fuzzyDelay(); + } +} + +// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list +// of the artist's tab metadatas (tab text is done in a different step) +async function saveArtistsWithTabMetadata(filename, artists) { + const baseURL = 'https://www.ultimate-guitar.com/'; + + let startTime = Date.now(); + let completed = 0; + let taskQueue = new ConcurrentQueue(8); // Run a maximum of 4 artist tab list scrapers at a time + // Note: the concurrent queue will (almost certainly) cause the artists to be somewhat to completely out of order in the output + for (let artist of artists) { + taskQueue.push(async () => { + let artistStartURL = new URL(artist.artist_url, baseURL); + let artistTabs = await scrapeAllArtistTabListPages(artistStartURL); + artist.tabs = artistTabs; + + completed += 1; + let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length)); + let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2); + let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2); + console.log(`Save Artists with Tab Metadata Status: ${completed} / ${artists.length} artists complete (${(completed / artists.length 
* 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin} %/min, ${artistsPerMin} artists/min)`); + }); + } + + await taskQueue.waitForDrain(); + + await saveJsonData(filename, artists); +} + +module.exports = { + scrapeBands: saveBandLists, + scrapeArtistTabUrls: saveArtistsWithTabMetadata +}; diff --git a/02-ingest-sqlite/01-injest-sqlite.js b/02-ingest-sqlite/01-injest-sqlite.js index ab65aa7..dd64f99 100644 --- a/02-ingest-sqlite/01-injest-sqlite.js +++ b/02-ingest-sqlite/01-injest-sqlite.js @@ -1,126 +1,126 @@ -// Injests from ./input/* into ./output/tabs.db - -const sqlite3 = require('sqlite3'); -const sqlite = require('sqlite'); - -const fs = require('fs/promises'); - -(async () => { - const db = await sqlite.open({ - driver: sqlite3.Database, - filename: './output/tabs-no-text.db' - }); - - await db.run(` - CREATE TABLE IF NOT EXISTS artists ( - scrape_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT - , id INTEGER - , name TEXT - , tabscount INTEGER - , artist_url TEXT - , tabs_last_update_timestamp INTEGER - ) - `); - - await db.run(` - CREATE TABLE IF NOT EXISTS tabs ( - scrape_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT - , artist_scrape_id INTEGER NOT NULL - , id INTEGER - , song_id INTEGER - , song_name TEXT - , artist_id INTEGER - , artist_name INTEGER - , type TEXT - , part TEXT - , version INTEGER - , votes INTEGER - , rating NUMERIC - , date TEXT - , status TEXT - , preset_id INTEGER - , tab_access_type TEXT - , tp_version INTEGER - , tonality_name TEXT - , version_description TEXT - , verified INTEGER - , artist_url TEXT - , tab_url TEXT - , tab_text TEXT - , difficulty TEXT - , tuning TEXT - , type_name TEXT - , FOREIGN KEY (artist_scrape_id) REFERENCES artists(scrape_id) - ) - `); - - // Clear out the database - await db.run('DELETE FROM tabs'); - await db.run('DELETE FROM artists'); - - const files = await fs.readdir('./input/'); - - const stmtAddArtist = await db.prepare(` - INSERT INTO artists ( - id, name, tabscount, artist_url, 
tabs_last_update_timestamp - ) VALUES ( - ?1, ?2, ?3, ?4, ?5 - ) - `); - const stmtAddTab = await db.prepare(` - INSERT INTO tabs ( - artist_scrape_id - , id, song_id, song_name, artist_id, artist_name, type, part - , version, votes, rating, date, status, preset_id, tab_access_type, tp_version, tonality_name - , version_description, verified, artist_url, tab_url, tab_text, difficulty, tuning, type_name - ) VALUES ( - ?1 - , ?2, ?3, ?4, ?5, ?6, ?7, ?8 - , ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17 - , ?18, ?19, ?20, ?21, ?22, ?23, ?24, ?25 - ) - `); - - function addArtist(id, name, tabscount, artist_url, tabs_last_update_timestamp) { - return stmtAddArtist.run([id, name, tabscount, artist_url, tabs_last_update_timestamp]); - } - - function addTab( - artist_scrape_id, id, song_id, song_name, artist_id, artist_name, type, part, version, votes, - rating, date, status, preset_id, tab_access_type, tp_version, tonality_name, version_description, - verified, artist_url, tab_url, tab_text, difficulty, tuning, type_name - ) { - return stmtAddTab.run([ - artist_scrape_id, id, song_id, song_name, artist_id, artist_name, type, part, version, votes, - rating, date, status, preset_id, tab_access_type, tp_version, tonality_name, version_description, - verified, artist_url, tab_url, tab_text, difficulty, tuning, type_name - ]); - } - - for (let file of files) { - if (!file.endsWith('.json')) continue; // skip the .keep file - console.log('reading ./input/' + file); - let dataJSON = await fs.readFile('./input/' + file); - let data = JSON.parse(dataJSON); - let artistIndex = 0; - for (let artist of data) { - console.log(`adding artist (${artistIndex+1}/${data.length}, ${artist.tabs.length} tabs): ${artist.name}`) - let artistResult = await addArtist(artist.id, artist.name, artist.tabscount, artist.artist_url, artist.tabs_last_update_timestamp); - let artistScrapeId = artistResult.lastID; - for (let tab of artist.tabs) { - addTab( - artistScrapeId, - tab.id, tab.song_id, 
tab.song_name, tab.artist_id, tab.artist_name, tab.type, tab.part, tab.version, tab.votes, - tab.rating, tab.date, tab.status, tab.preset_id, tab.tab_access_type, tab.tp_version, tab.tonality_name, tab.version_description, - tab.verified, tab.artist_url, tab.tab_url, null, tab.difficulty, tab.tuning, tab.type_name - ); - } - artistIndex += 1; - } - } - - await stmtAddTab.finalize(); - await stmtAddArtist.finalize(); - - await db.close(); -})(); +// Injests from ./input/* into ./output/tabs.db + +const sqlite3 = require('sqlite3'); +const sqlite = require('sqlite'); + +const fs = require('fs/promises'); + +(async () => { + const db = await sqlite.open({ + driver: sqlite3.Database, + filename: './output/tabs-no-text.db' + }); + + await db.run(` + CREATE TABLE IF NOT EXISTS artists ( + scrape_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT + , id INTEGER + , name TEXT + , tabscount INTEGER + , artist_url TEXT + , tabs_last_update_timestamp INTEGER + ) + `); + + await db.run(` + CREATE TABLE IF NOT EXISTS tabs ( + scrape_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT + , artist_scrape_id INTEGER NOT NULL + , id INTEGER + , song_id INTEGER + , song_name TEXT + , artist_id INTEGER + , artist_name INTEGER + , type TEXT + , part TEXT + , version INTEGER + , votes INTEGER + , rating NUMERIC + , date TEXT + , status TEXT + , preset_id INTEGER + , tab_access_type TEXT + , tp_version INTEGER + , tonality_name TEXT + , version_description TEXT + , verified INTEGER + , artist_url TEXT + , tab_url TEXT + , tab_text TEXT + , difficulty TEXT + , tuning TEXT + , type_name TEXT + , FOREIGN KEY (artist_scrape_id) REFERENCES artists(scrape_id) + ) + `); + + // Clear out the database + await db.run('DELETE FROM tabs'); + await db.run('DELETE FROM artists'); + + const files = await fs.readdir('./input/'); + + const stmtAddArtist = await db.prepare(` + INSERT INTO artists ( + id, name, tabscount, artist_url, tabs_last_update_timestamp + ) VALUES ( + ?1, ?2, ?3, ?4, ?5 + ) + `); + const 
stmtAddTab = await db.prepare(` + INSERT INTO tabs ( + artist_scrape_id + , id, song_id, song_name, artist_id, artist_name, type, part + , version, votes, rating, date, status, preset_id, tab_access_type, tp_version, tonality_name + , version_description, verified, artist_url, tab_url, tab_text, difficulty, tuning, type_name + ) VALUES ( + ?1 + , ?2, ?3, ?4, ?5, ?6, ?7, ?8 + , ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17 + , ?18, ?19, ?20, ?21, ?22, ?23, ?24, ?25 + ) + `); + + function addArtist(id, name, tabscount, artist_url, tabs_last_update_timestamp) { + return stmtAddArtist.run([id, name, tabscount, artist_url, tabs_last_update_timestamp]); + } + + function addTab( + artist_scrape_id, id, song_id, song_name, artist_id, artist_name, type, part, version, votes, + rating, date, status, preset_id, tab_access_type, tp_version, tonality_name, version_description, + verified, artist_url, tab_url, tab_text, difficulty, tuning, type_name + ) { + return stmtAddTab.run([ + artist_scrape_id, id, song_id, song_name, artist_id, artist_name, type, part, version, votes, + rating, date, status, preset_id, tab_access_type, tp_version, tonality_name, version_description, + verified, artist_url, tab_url, tab_text, difficulty, tuning, type_name + ]); + } + + for (let file of files) { + if (!file.endsWith('.json')) continue; // skip the .keep file + console.log('reading ./input/' + file); + let dataJSON = await fs.readFile('./input/' + file); + let data = JSON.parse(dataJSON); + let artistIndex = 0; + for (let artist of data) { + console.log(`adding artist (${artistIndex+1}/${data.length}, ${artist.tabs.length} tabs): ${artist.name}`) + let artistResult = await addArtist(artist.id, artist.name, artist.tabscount, artist.artist_url, artist.tabs_last_update_timestamp); + let artistScrapeId = artistResult.lastID; + for (let tab of artist.tabs) { + addTab( + artistScrapeId, + tab.id, tab.song_id, tab.song_name, tab.artist_id, tab.artist_name, tab.type, tab.part, tab.version, tab.votes, + 
tab.rating, tab.date, tab.status, tab.preset_id, tab.tab_access_type, tab.tp_version, tab.tonality_name, tab.version_description, + tab.verified, tab.artist_url, tab.tab_url, null, tab.difficulty, tab.tuning, tab.type_name + ); + } + artistIndex += 1; + } + } + + await stmtAddTab.finalize(); + await stmtAddArtist.finalize(); + + await db.close(); +})(); diff --git a/04-scraper-tabs/01-scrape-tabs.js b/04-scraper-tabs/01-scrape-tabs.js index 4ebb8cb..4c699cc 100644 --- a/04-scraper-tabs/01-scrape-tabs.js +++ b/04-scraper-tabs/01-scrape-tabs.js @@ -1,241 +1,241 @@ -// Scrapes tabs from ultimate-guitar.com to complete the database - -// node-fetch is an asshole that wants to be ESM-only so we have to do special stuff to import it easily -const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args)); -const jsdom = require('jsdom'); - -const sqlite3 = require('sqlite3'); -const sqlite = require('sqlite'); - -const ConcurrentQueue = require('./concurrent-queue.js'); - -class ScrapeError extends Error { - constructor(message, options, fileName, lineNumber) { - super(...arguments); - this.name = 'ScrapeError'; - } -} - -function estMSRemaining(startTime, ratioComplete) { - return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete); -} - -function formatRelative(msRelative) { - if (msRelative < 1000) return `${(msRelative).toFixed(2)}ms`; - else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)}s`; - else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`; - else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`; -} - -function sleep(ms) { - return new Promise((resolve) => { - setTimeout(resolve, ms); - }); -} - -// modified from background-script to use jsdom -async function scrapeUGDataContent(url) { - let page = null; - try { - page = await fetch(url); - } catch (e) { - throw new ScrapeError('Unable to fetch url', { cause: e }); - } - - let text = null; - try { - 
text = await page.text(); - } catch (e) { - throw new ScrapeError('Unable to decode page', { cause: e }); - } - - let dom = null; - try { - dom = new jsdom.JSDOM(text); - } catch (e) { - throw new ScrapeError('Unable to parse document', { cause: e }); - } - - if (!dom.window || !dom.window.document) { - throw new ScrapeError('Unable to parse document'); - } - - let document = dom.window.document; - - const jsStore = document.querySelector('.js-store'); - if (jsStore == null) { - throw new ScrapeError('Unable to find .js-store element') - } - - const contentJSON = jsStore.getAttribute('data-content'); - if (contentJSON == null) { - throw new ScrapeError('Unable to find data-content attribute on .js-store'); - } - - const content = JSON.parse(contentJSON); - return content; -} - -function parseGeneralTab(ugDataContent) { - const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); - const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); - const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); - const meta = data.tab; if (meta === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab'); - const tview = data.tab_view; if (tview === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view'); - const wktab = tview.wiki_tab; if (wktab === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab'); - const text = wktab.content; if (text === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content'); - return { meta, text }; -} - -// Can only fetch Bass, Chords, Drums, Tab, and Ukulele type_name tabs -async function fetchGeneralTab(url) { - let ugDataContent = await scrapeUGDataContent(url); - return parseGeneralTab(ugDataContent); -} - -(async () => { - - const db = await sqlite.open({ 
- driver: sqlite3.Database, - filename: './input/tabs.db' - }); - - // Progress queries - - let totalFetchable = (await db.get(` - SELECT - COUNT(*) AS c - FROM - tabs - WHERE - tab_url IS NOT NULL - AND type_name IS NOT NULL - AND ( - type_name='Bass' - OR type_name='Chords' - OR type_name='Drums' - OR type_name='Tab' - OR type_name='Ukulele' - ) - `)).c; - - console.log(`${totalFetchable} Total Fetchable Tabs`) - - let completedFetchable = (await db.get(` - SELECT - COUNT(*) AS c - FROM - tabs - WHERE - tab_text IS NOT NULL - AND tab_url IS NOT NULL - AND type_name IS NOT NULL - AND ( - type_name='Bass' - OR type_name='Chords' - OR type_name='Drums' - OR type_name='Tab' - OR type_name='Ukulele' - ) - `)).c; - - console.log(`${completedFetchable} (${(100 * completedFetchable / totalFetchable).toFixed(2)}%) Fetchable Tabs already completed`); - - let stmtUpdateTab = await db.prepare(` - UPDATE - tabs - SET - user_id=?1 - , user_iq=?2 - , username=?3 - , tab_text=?4 - WHERE - scrape_id=?5 - `); - - // nigger :) - - let remainingFetchable = totalFetchable - completedFetchable; - let sessionCompleted = 0; - let startTime = Date.now(); - let badUrls = new Set(); - while (true) { - let queryStartTime = Date.now(); - let result = await db.all(` - SELECT - scrape_id - , tab_url - FROM - tabs - WHERE - tab_text IS NULL - AND tab_url IS NOT NULL - AND type_name IS NOT NULL - AND ( - type_name='Bass' - OR type_name='Chords' - OR type_name='Drums' - OR type_name='Tab' - OR type_name='Ukulele' - ) - ORDER BY - bucket - LIMIT 300 - `); - console.log(`SQLite Query took ${Date.now() - queryStartTime} ms`); - // console.log('Sleeping for 10s'); - // await sleep(10000); - - if (result.length === 0) break; - - let batchCompleted = 0; - let queue = new ConcurrentQueue(5); - for (let tabInfo of result) { - if (badUrls.has(tabInfo.tab_url)) continue; - (async () => { - try { - await queue.push(async () => { - let { meta, text } = await fetchGeneralTab(tabInfo.tab_url); - let user_id = 
meta.user_id; - let user_iq = meta.user_iq; - let username = meta.username; - let tab_text = text; - await stmtUpdateTab.run([ user_id, user_iq, username, tab_text, tabInfo.scrape_id ]); - batchCompleted += 1; - if (batchCompleted % (Math.floor(result.length / 10)) === 0) { - console.log(`batch completed: ${batchCompleted}/${result.length - badUrls.size}`); - } - }); - } catch (e) { - console.error('Error fetching tab for ', tabInfo.tab_url, '. Error:', e.message); - badUrls.add(tabInfo.tab_url); - } - })(); - await sleep(100); - } - await queue.waitForDrain(); - - sessionCompleted += batchCompleted; - - let elapsed = formatRelative(Date.now() - startTime); - let minsElapsed = (Date.now() - startTime) / (60 * 1000); - let estimatedRemaining = formatRelative(estMSRemaining(startTime, sessionCompleted / remainingFetchable)); - let pctComplete = (100 * sessionCompleted / remainingFetchable); - let pctPerMin = (pctComplete / minsElapsed); - let tabsPerMin = (sessionCompleted / ((Date.now() - startTime) / (60 * 1000))); - console.log(''); - console.log(`${sessionCompleted}/${remainingFetchable} tabs complete (${pctComplete.toFixed(2)}%)`); - console.log(`${tabsPerMin.toFixed(2)} tabs/min (${pctPerMin.toFixed(5)} %/min)`); - console.log(`${elapsed} elapsed (est. 
${estimatedRemaining} remaining)`); - console.log(''); - - if (batchCompleted / result.length < .5) { - console.log('We got kicked off at ', new Date().toString()); - break; - } - } - - await stmtUpdateTab.finalize(); - - await db.close(); -})(); +// Scrapes tabs from ultimate-guitar.com to complete the database + +// node-fetch is an asshole that wants to be ESM-only so we have to do special stuff to import it easily +const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args)); +const jsdom = require('jsdom'); + +const sqlite3 = require('sqlite3'); +const sqlite = require('sqlite'); + +const ConcurrentQueue = require('./concurrent-queue.js'); + +class ScrapeError extends Error { + constructor(message, options, fileName, lineNumber) { + super(...arguments); + this.name = 'ScrapeError'; + } +} + +function estMSRemaining(startTime, ratioComplete) { + return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete); +} + +function formatRelative(msRelative) { + if (msRelative < 1000) return `${(msRelative).toFixed(2)}ms`; + else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)}s`; + else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`; + else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`; +} + +function sleep(ms) { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} + +// modified from background-script to use jsdom +async function scrapeUGDataContent(url) { + let page = null; + try { + page = await fetch(url); + } catch (e) { + throw new ScrapeError('Unable to fetch url', { cause: e }); + } + + let text = null; + try { + text = await page.text(); + } catch (e) { + throw new ScrapeError('Unable to decode page', { cause: e }); + } + + let dom = null; + try { + dom = new jsdom.JSDOM(text); + } catch (e) { + throw new ScrapeError('Unable to parse document', { cause: e }); + } + + if (!dom.window || !dom.window.document) { + throw new 
ScrapeError('Unable to parse document'); + } + + let document = dom.window.document; + + const jsStore = document.querySelector('.js-store'); + if (jsStore == null) { + throw new ScrapeError('Unable to find .js-store element') + } + + const contentJSON = jsStore.getAttribute('data-content'); + if (contentJSON == null) { + throw new ScrapeError('Unable to find data-content attribute on .js-store'); + } + + const content = JSON.parse(contentJSON); + return content; +} + +function parseGeneralTab(ugDataContent) { + const store = ugDataContent.store; if (store === null) throw new ScrapeError('Unable to get ugDataContent.store'); + const page = store.page; if (page === null) throw new ScrapeError('Unable to get ugDataContent.store.page'); + const data = page.data; if (data === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data'); + const meta = data.tab; if (meta === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab'); + const tview = data.tab_view; if (tview === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view'); + const wktab = tview.wiki_tab; if (wktab === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab'); + const text = wktab.content; if (text === null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content'); + return { meta, text }; +} + +// Can only fetch Bass, Chords, Drums, Tab, and Ukulele type_name tabs +async function fetchGeneralTab(url) { + let ugDataContent = await scrapeUGDataContent(url); + return parseGeneralTab(ugDataContent); +} + +(async () => { + + const db = await sqlite.open({ + driver: sqlite3.Database, + filename: './input/tabs.db' + }); + + // Progress queries + + let totalFetchable = (await db.get(` + SELECT + COUNT(*) AS c + FROM + tabs + WHERE + tab_url IS NOT NULL + AND type_name IS NOT NULL + AND ( + type_name='Bass' + OR type_name='Chords' + OR type_name='Drums' + OR 
type_name='Tab' + OR type_name='Ukulele' + ) + `)).c; + + console.log(`${totalFetchable} Total Fetchable Tabs`) + + let completedFetchable = (await db.get(` + SELECT + COUNT(*) AS c + FROM + tabs + WHERE + tab_text IS NOT NULL + AND tab_url IS NOT NULL + AND type_name IS NOT NULL + AND ( + type_name='Bass' + OR type_name='Chords' + OR type_name='Drums' + OR type_name='Tab' + OR type_name='Ukulele' + ) + `)).c; + + console.log(`${completedFetchable} (${(100 * completedFetchable / totalFetchable).toFixed(2)}%) Fetchable Tabs already completed`); + + let stmtUpdateTab = await db.prepare(` + UPDATE + tabs + SET + user_id=?1 + , user_iq=?2 + , username=?3 + , tab_text=?4 + WHERE + scrape_id=?5 + `); + + // nigger :) + + let remainingFetchable = totalFetchable - completedFetchable; + let sessionCompleted = 0; + let startTime = Date.now(); + let badUrls = new Set(); + while (true) { + let queryStartTime = Date.now(); + let result = await db.all(` + SELECT + scrape_id + , tab_url + FROM + tabs + WHERE + tab_text IS NULL + AND tab_url IS NOT NULL + AND type_name IS NOT NULL + AND ( + type_name='Bass' + OR type_name='Chords' + OR type_name='Drums' + OR type_name='Tab' + OR type_name='Ukulele' + ) + ORDER BY + bucket + LIMIT 300 + `); + console.log(`SQLite Query took ${Date.now() - queryStartTime} ms`); + // console.log('Sleeping for 10s'); + // await sleep(10000); + + if (result.length === 0) break; + + let batchCompleted = 0; + let queue = new ConcurrentQueue(5); + for (let tabInfo of result) { + if (badUrls.has(tabInfo.tab_url)) continue; + (async () => { + try { + await queue.push(async () => { + let { meta, text } = await fetchGeneralTab(tabInfo.tab_url); + let user_id = meta.user_id; + let user_iq = meta.user_iq; + let username = meta.username; + let tab_text = text; + await stmtUpdateTab.run([ user_id, user_iq, username, tab_text, tabInfo.scrape_id ]); + batchCompleted += 1; + if (batchCompleted % (Math.floor(result.length / 10)) === 0) { + console.log(`batch 
completed: ${batchCompleted}/${result.length - badUrls.size}`); + } + }); + } catch (e) { + console.error('Error fetching tab for ', tabInfo.tab_url, '. Error:', e.message); + badUrls.add(tabInfo.tab_url); + } + })(); + await sleep(100); + } + await queue.waitForDrain(); + + sessionCompleted += batchCompleted; + + let elapsed = formatRelative(Date.now() - startTime); + let minsElapsed = (Date.now() - startTime) / (60 * 1000); + let estimatedRemaining = formatRelative(estMSRemaining(startTime, sessionCompleted / remainingFetchable)); + let pctComplete = (100 * sessionCompleted / remainingFetchable); + let pctPerMin = (pctComplete / minsElapsed); + let tabsPerMin = (sessionCompleted / ((Date.now() - startTime) / (60 * 1000))); + console.log(''); + console.log(`${sessionCompleted}/${remainingFetchable} tabs complete (${pctComplete.toFixed(2)}%)`); + console.log(`${tabsPerMin.toFixed(2)} tabs/min (${pctPerMin.toFixed(5)} %/min)`); + console.log(`${elapsed} elapsed (est. ${estimatedRemaining} remaining)`); + console.log(''); + + if (batchCompleted / result.length < .5) { + console.log('We got kicked off at ', new Date().toString()); + break; + } + } + + await stmtUpdateTab.finalize(); + + await db.close(); +})(); diff --git a/04-scraper-tabs/concurrent-queue.js b/04-scraper-tabs/concurrent-queue.js index c1c3abd..09cb51b 100644 --- a/04-scraper-tabs/concurrent-queue.js +++ b/04-scraper-tabs/concurrent-queue.js @@ -1,48 +1,48 @@ -// Runs a limited number of promises at one time -class ConcurrentQueue { - constructor(consecutive) { - this.consecutive = consecutive; - this.queue = []; - this.current = 0; - this.drainListeners = []; - } - - _checkQueue() { - if (this.current == 0 && this.queue.length == 0) { - for (let drainListener of this.drainListeners) { - drainListener(); - } - this.drainListeners = []; - } - while (this.current < this.consecutive && this.queue.length > 0) { - let taskData = this.queue.shift(); - this.current += 1; - (async () => { - try { - 
taskData.resolve(await taskData.task()); - } catch (e) { - taskData.reject(e); - } - this.current -= 1; - this._checkQueue(); - })(); - } - } - - // returns a promise that can be awaited to get the resolution or rejection of the task's execution - push(task) { - return new Promise((resolve, reject) => { - this.queue.push({ task, resolve, reject }) - this._checkQueue(); - }); - } - - async waitForDrain() { - return new Promise((resolve) => { - this.drainListeners.push(resolve); - this._checkQueue(); - }); - } -} - -module.exports = ConcurrentQueue; +// Runs a limited number of promises at one time +class ConcurrentQueue { + constructor(consecutive) { + this.consecutive = consecutive; + this.queue = []; + this.current = 0; + this.drainListeners = []; + } + + _checkQueue() { + if (this.current == 0 && this.queue.length == 0) { + for (let drainListener of this.drainListeners) { + drainListener(); + } + this.drainListeners = []; + } + while (this.current < this.consecutive && this.queue.length > 0) { + let taskData = this.queue.shift(); + this.current += 1; + (async () => { + try { + taskData.resolve(await taskData.task()); + } catch (e) { + taskData.reject(e); + } + this.current -= 1; + this._checkQueue(); + })(); + } + } + + // returns a promise that can be awaited to get the resolution or rejection of the task's execution + push(task) { + return new Promise((resolve, reject) => { + this.queue.push({ task, resolve, reject }) + this._checkQueue(); + }); + } + + async waitForDrain() { + return new Promise((resolve) => { + this.drainListeners.push(resolve); + this._checkQueue(); + }); + } +} + +module.exports = ConcurrentQueue; diff --git a/06-output-generator/01-output-generator.js b/06-output-generator/01-output-generator.js index 2645f5a..304182e 100644 --- a/06-output-generator/01-output-generator.js +++ b/06-output-generator/01-output-generator.js @@ -1,126 +1,135 @@ -const sqlite3 = require('sqlite3'); -const sqlite = require('sqlite'); - -const fs = require('fs'); 
-const path = require('path'); - -const fsExtra = require('fs-extra'); - -function sanitizeFileName(name) { - // Windows Version (created for Windows, most likely works cross-platform too given my research) - // Allowed Characters: Extended Unicode Charset (1-255) - // Illegal file names: CON, PRN, AUX, NUL, COM1, COM2, ..., COM9, LPT1, LPT2, ..., LPT9 - // Reserved Characters: <>:"/\|?* - // Solution: Replace reserved characters with empty string (''), bad characters with '_', and append '_' to bad names - - // Illegal File Names (Windows) - if ([ 'CON', 'PRN', 'AUX', 'NUL', - 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', - 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' ].indexOf(name) != -1) { // TODO: case insensitive? - name += '_'; - } - // Reserved Characters - name = name.replace(/[<>:\"\/\\|?*]/g, ''); - // Allowed Characters - return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join(''); - - // Much stricter whitelist version - // replace bad characters with '_' - //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? 
c : '_').join(''); -} - -function getAvailableFileName(dir, name) { - name = sanitizeFileName(name); - let ext = path.extname(name); - let baseName = path.basename(name, ext); - let availableBaseName = baseName; - let tries = 1; - while (fs.existsSync(path.join(dir, availableBaseName + ext))) { - availableBaseName = baseName + '-' + (++tries); - } - return availableBaseName + ext; -} - -(async () => { - // Clear out old output directory - await fsExtra.emptyDir('./output/'); - - const db = await sqlite.open({ - driver: sqlite3.Database, - filename: './input/tabs-full.db' - }); - - console.log('connected to db'); - - let total = (await db.get(` - SELECT COUNT(*) AS c FROM tabs WHERE tab_text IS NOT NULL - `)).c; - - console.log(`${total} total tabs`); - - let soFar = 0; - const totalRows = await db.each(` - SELECT - scrape_id - , id - , song_id - , song_name - , artist_id - , artist_name - , version - , version_description - , votes - , rating - , date - , tonality_name - , verified - , artist_url - , tab_url - , difficulty - , tuning - , type_name - , user_id - , user_iq - , username - , tab_text - FROM tabs - WHERE tab_text IS NOT NULL - `, (err, row) => { - if (err) throw err; - - soFar += 1; - let fileText = -`${row.song_name} [${row.song_id}]: ${row.tab_url} -By ${row.artist_name} [${row.artist_id}]: ${row.artist_url} -Rating: ${row.rating}, Votes: ${row.votes} -Date: ${row.date} -Tonality: ${row.tonality_name} -Difficulty: ${row.difficulty} -Tuning: ${row.tuning} -Type: ${row.type_name} -Tab By: ${row.username} [${row.user_id}] (${row.user_iq} iq) -Verified: ${row.verified} -Version ${row.version} -${row.version_description || ''} - -${row.tab_text} -`; - let typeDir = path.join('output', row.type_name ?? 
'null'); - if (!fs.existsSync(typeDir)) { - fs.mkdirSync(typeDir); - } - - let fileDir = sanitizeFileName(row.artist_name + '-' + row.artist_id); - if (!fs.existsSync(path.join(typeDir, fileDir))) { - fs.mkdirSync(path.join(typeDir, fileDir)); - } - - let fileName = getAvailableFileName(path.join(typeDir, fileDir), row.song_name + '.txt'); - fs.writeFileSync(path.join(typeDir, fileDir, fileName), fileText); - - if (soFar % 100 == 0) { - console.log(`Tab #${soFar}/${total} (${(100 * soFar / total).toFixed(2)}%): ${path.join(typeDir, fileDir, fileName)}`); - } - }); - -})(); +const sqlite3 = require('sqlite3'); +const sqlite = require('sqlite'); + +const fs = require('fs'); +const path = require('path'); + +const fsExtra = require('fs-extra'); + +function sanitizeFileName(name) { + // Windows Version (created for Windows, most likely works cross-platform too given my research) + // Allowed Characters: Extended Unicode Charset (1-255) + // Illegal file names: CON, PRN, AUX, NUL, COM1, COM2, ..., COM9, LPT1, LPT2, ..., LPT9 + // Reserved Characters: <>:"/\|?* + // Solution: Replace reserved characters with empty string (''), bad characters with '_', and append '_' to bad names + + // Illegal File Names (Windows) + if ([ 'CON', 'PRN', 'AUX', 'NUL', + 'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9', + 'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9' ].indexOf(name) != -1) { // TODO: case insensitive? + name += '_'; + } + // Reserved Characters + name = name.replace(/[<>:\"\/\\|?*]/g, ''); + // Allowed Characters + return name.split('').map(c => c.charCodeAt(0) < 255 && c.charCodeAt(0) > 0 ? c : '_').join(''); + + // Much stricter whitelist version + // replace bad characters with '_' + //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? 
c : '_').join(''); +} + +function getAvailableFileName(dir, name) { + name = sanitizeFileName(name); + let ext = path.extname(name); + let baseName = path.basename(name, ext); + let availableBaseName = baseName; + let tries = 1; + while (fs.existsSync(path.join(dir, availableBaseName + ext))) { + availableBaseName = baseName + '-' + (++tries); + } + return availableBaseName + ext; +} + +function cleanTab(tab_text) { + return tab_text + .replace(/\[tab\]/g, '') + .replace(/\[\/tab\]/g, '') + .replace(/\[ch]/g, '') + .replace(/\[\/ch\]/g, ''); +} + +(async () => { + // Clear out old output directory + await fsExtra.emptyDir('./output/'); + + const db = await sqlite.open({ + driver: sqlite3.Database, + filename: './input/tabs-full.db' + }); + + console.log('connected to db'); + + let total = (await db.get(` + SELECT COUNT(*) AS c FROM tabs WHERE tab_text IS NOT NULL + `)).c; + + console.log(`${total} total tabs`); + + let soFar = 0; + const totalRows = await db.each(` + SELECT + scrape_id + , id + , song_id + , song_name + , artist_id + , artist_name + , version + , version_description + , votes + , rating + , date + , tonality_name + , verified + , artist_url + , tab_url + , difficulty + , tuning + , type_name + , user_id + , user_iq + , username + , tab_text + FROM tabs + WHERE tab_text IS NOT NULL + ORDER BY rating * votes + votes + `, (err, row) => { + if (err) throw err; + + soFar += 1; + let fileText = +`${row.song_name} [${row.song_id}]: ${row.tab_url} +By ${row.artist_name} [${row.artist_id}]: ${row.artist_url} +Rating: ${row.rating}, Votes: ${row.votes} +Tab By: ${row.username} [${row.user_id}] (${row.user_iq} iq) +Last Edit: ${new Date(row.date * 1000).toLocaleString()}${row.version_description ? 
'\n------------------------------------------------------------------------\n' + row.version_description : ''} +------------------------------------------------------------------------ +${cleanTab(row.tab_text).trim()/* Remove [bbcode]tags[/bbcode] */} +------------------------------------------------------------------------ +Tonality: ${row.tonality_name} +Difficulty: ${row.difficulty} +Tuning: ${row.tuning} +Type: ${row.type_name} +Verified: ${row.verified} +Version ${row.version} +`; + let typeDir = path.join('output', row.type_name ?? 'null'); + if (!fs.existsSync(typeDir)) { + fs.mkdirSync(typeDir); + } + + let fileDir = sanitizeFileName(row.artist_name + '-' + row.artist_id); + if (!fs.existsSync(path.join(typeDir, fileDir))) { + fs.mkdirSync(path.join(typeDir, fileDir)); + } + + let fileName = getAvailableFileName(path.join(typeDir, fileDir), row.song_name + '.txt'); + fs.writeFileSync(path.join(typeDir, fileDir, fileName), fileText); + + if (soFar % 100 == 0) { + console.log(`Tab #${soFar}/${total} (${(100 * soFar / total).toFixed(2)}%): ${path.join(typeDir, fileDir, fileName)}`); + } + }); + +})(); diff --git a/README.md b/README.md index 84b6768..427852f 100644 --- a/README.md +++ b/README.md @@ -1,274 +1,274 @@ -# ultimate-guitar.com Tab Scraper - -This file set allows for a 6 step process to scrape the tabs off ultimate-guitar.com. - -It takes advantage of a "feature" in the ultimate-guitar.com's rendering techniques that puts website data inside of a div with class '.js-store'. - -Scraping all 1.1 million public tabs from the site is pretty easy and can be done in 6 steps. - -You're going to want a VPN for this because you will get kicked off and IP blocked every 2-8 hours (depending on how agressive you are when scraping). Reconnect to another IP and you'll be good to continue scraping. - -Download the sqlite3 command line client from https://sqlite.org/download.html -Download Node.js from https://nodejs.org/ - -## 1. 
Scrape Tab URLs - -This step maps out all pages on ultimate-guitar.com that can be scraped - -Enter 01-scraper-urls - -Run -> npm install - -Open up 01-scrape-bands.js and customize the band list links - -Run -> node 01-scrape-bands.js - -This script will save the artist data to output/artists/*.json - -Open up 02-scrape-artist-tab-urls.js and customize the artist file list - -Run -> node 02-scrape-artist-tab-urls.js - -This script will add tab information to the artist data and save it to output/artists-with-tabs/*.json - -## 2. Injest scraped urls into a sqlite database - -This step converts the .json files into a sqlite database to allow scraping to be paused and restarted easily. - -Copy 01-scraper-urls/output/artists-with-tabs/*.json into 02-injest-sqlite/input/ - -Run -> npm install -> node 01-injest-sqlite.js - -Note: this script queues up the artist inserts into the database and then waits for the inserts to finish. Don't be surprised if it hangs for a few hours (it took 6 hours on my SSD). - -This script will create a sqlite database, output/tabs-no-text.db - -## 3. (Optional) Split the sqlite database for parallelized scraping - -This step splits the generated sqlite database into multiple databases so that you can more easily use multiple machines to scrape the site. - -Copy the tabs-no-text.db into 03-splitter-sqlite/ - -Determine the number of machines that you want to run the scraper on. 
Call this number N - -Open up tabs-no-text.db -> sqlite3 tabs-no-text.db - -Create a view that separates the tabs into buckets -> CREATE VIEW tabs_bucketed AS SELECT *, NTILE(N) OVER (ORDER BY rowid) AS bucket FROM tabs; - -For each machine, i: - -1) Create a new database -> sqlite3 tabs-i-no-text.db - -2) Attach to the base database -> ATTACH 'tabs-no-text.db' AS db2; - -3) Create a table with the rows from the machine's bucket -> CREATE TABLE tabs AS SELECT * FROM db2.tabs_bucketed WHERE bucket=i; - -Make sure to hold onto your tabs-no-text.db database for the merging process. - -## 4. Scrape the tabs - -For each machine, i - -Copy the machine's tabs database into 04-scraper-tabs/input/tabs.db - -Important: Make sure to **change the name** of the database file to **tabs.db** OR the file name in 01-scrape.js:100 - filename: './input/tabs-laptop.db'. - -### 4.1 (Recommended) Index your database - -It's highly recommended to index your database - -Open up 04-scraper-tabs/input/ - -Run -> sqlite3 tabs.db - -Run the following SQL queries -CREATE INDEX IF NOT EXISTS tabs_scrape_id_idx ON tabs (scrape_id); -CREATE INDEX IF NOT EXISTS tabs_tab_url_idx ON tabs (tab_url); -CREATE INDEX IF NOT EXISTS tabs_type_name_idx ON tabs (type_name); - -### 4.2 (Optional) Optimize the tab scraper - -Customize 01-scrape-tabs.js - -Key Lines: - -1) Line 192: let queue = new ConcurrentQueue(5); -- - Increasing this value will increase the number of concurrent requests sent. -- - Note: Higher concurrent request counts result in more agressive scrapes that may run more quickly but also get you kicked off more quickly - -2) Line 214: await sleep(100); -- - Increasing this value (in ms) will increase the delay between the first few concurrent requests. This staggers the requests, potentially reducing the chance you get kicked off. 
- -3) Line 183: LIMIT 300 -- - Increasing the value in this line will increase the number of tabs scraped from the database at a time before sending a status update. Lower values will query the database more but give more frequent status updates. Higher values will take up more space in process memory and give less frequent status updates. -- - If this value is set too low, removed tab urls will likely fill up the result set, causing the program to incorrectly detect that it got kicked off. - -I found that running 500 tabs/minute gave me a good balance in effort spent reconnecting to VPN and scraping speed. Typically I would have to reset the scraper every 4-6 hours with this rate. - -I got this with concurrency=5 and sleep=100. - -### 4.4 Add required columns for scraping - -Open your tabs.db in sqlite -> sqlite3 tabs.db - -Run the following commands to add the needed columns: - -ALTER TABLE tabs DROP COLUMN tab_text; -ALTER TABLE tabs ADD COLUMN user_id INTEGER; -ALTER TABLE tabs ADD COLUMN user_iq INTEGER; -ALTER TABLE tabs ADD COLUMN username TEXT; -ALTER TABLE tabs ADD COLUMN tab_text TEXT; - -### 4.3 Scrape the tabs - -Important: Make sure to **change the name** of the database file to **tabs.db** OR the file name in 01-scrape.js:100 - filename: './input/tabs-laptop.db'. - -Run -> npm install -> node 01-scrape-tabs.js - -This will add tab information to the tabs.db database. - -Note: This scraper only works for the following tab types: -- Bass -- Chords -- Drums -- Tab -- Ukulele - -The following tab types are not supported: -- Guitar Pro -- Official -- Power -- Video - -## 5. (Optional) Merge the sqlite databases - -Move the partial tabs databases from each machine to 05-merger-sqlite/input/{tabs-i.db} -- Note: i is the machine number from before - -Move the tabs-no-text.db database from step 2 into 05-merger-sqlite/input/. 
- -Open up tabs-no-text.db -> sqlite3 tabs-no-text.db - -Create an index on tabs.tab_url -> CREATE INDEX tabs_tab_url_idx ON tabs (tab_url); - -Open up the final tabs database, tabs-full.db -> sqlite3 tabs-full.db - -Attach the no-text database -> ATTACH 'tabs-no-text.db' AS 'dbnt'; - -Attach each machine database in the following format: -> ATTACH 'tabs-i.db' AS 'dbi'; - -Create a the final tabs table -CREATE TABLE tabs ( - scrape_id INTEGER - , artist_scrape_id INTEGER NOT NULL - , id INTEGER - , song_id INTEGER - , song_name TEXT - , artist_id INTEGER - , artist_name INTEGER - , type TEXT - , part TEXT - , version INTEGER - , votes INTEGER - , rating NUMERIC - , date TEXT - , status TEXT - , preset_id INTEGER - , tab_access_type TEXT - , tp_version INTEGER - , tonality_name TEXT - , version_description TEXT - , verified INTEGER - , artist_url TEXT - , tab_url TEXT - , difficulty TEXT - , tuning TEXT - , type_name TEXT - , user_id INTEGER - , user_iq INTEGER - , username TEXT - , tab_text TEXT -); - -For each machine, insert its respective tabs into the table -INSERT INTO - tabs -SELECT - tabsnt.scrape_id - , tabsnt.artist_scrape_id - , tabsnt.id - , tabsnt.song_id - , tabsnt.song_name - , tabsnt.artist_id - , tabsnt.artist_name - , tabsnt.type - , tabsnt.part - , tabsnt.version - , tabsnt.votes - , tabsnt.rating - , tabsnt.date - , tabsnt.status - , tabsnt.preset_id - , tabsnt.tab_access_type - , tabsnt.tp_version - , tabsnt.tonality_name - , tabsnt.version_description - , tabsnt.verified - , tabsnt.artist_url - , tabsnt.tab_url - , tabsnt.difficulty - , tabsnt.tuning - , tabsnt.type_name - , tabsm.user_id - , tabsm.user_iq - , tabsm.username - , tabsm.tab_text -FROM - dbnt.tabs AS tabsnt - JOIN dbi.tabs AS tabsm ON tabsnt.tab_url=tabsm.tab_url -WHERE - tabsm.tab_url IS NOT NULL - AND tabsm.tab_text IS NOT NULL; - -Note: this command can take a bit to complete (30s-2m) depending on how large your databases are. - -## 6. 
Print the contents of the database into organized text files - -Copy your filled tabs database to 06-output-generator/input/tabs-full.db - -Note: Make sure you either rename it in the directory or update 01-output-generator.js:52 with the proper file name - -Run -> npm install -> node --max-old-space-size=16384 01-output-generator.js - -Note: depending on how many tabs you scraped, you may have to increase the max-old-space-size (Max RAM). This example uses 16GB of ram. -- I'm suspicious the memory leak is in the sqlite package >:| - -Congratulations! Your guitar tabs are now organized in: -06-output-generator/output/{type}/{artist}-{artist_id}/{song}.txt - -## Other Information - -You can customize the output generator's file output by modifying the fileText variable in 01-output-gernerator.js:84-99 - -Note: The .keep files can be ignored/deleted. They are kept to keep the default directory structure in git. +# ultimate-guitar.com Tab Scraper + +This file set allows for a 6-step process to scrape the tabs off ultimate-guitar.com. + +It takes advantage of a "feature" of ultimate-guitar.com's rendering technique that puts website data inside of a div with class '.js-store'. + +Scraping all 1.1 million public tabs from the site is pretty easy and can be done in 6 steps. + +You're going to want a VPN for this because you will get kicked off and IP blocked every 2-8 hours (depending on how aggressive you are when scraping). Reconnect to another IP and you'll be good to continue scraping. + +Download the sqlite3 command line client from https://sqlite.org/download.html +Download Node.js from https://nodejs.org/ + +## 1. 
Scrape Tab URLs + +This step maps out all pages on ultimate-guitar.com that can be scraped + +Enter 01-scraper-urls + +Run +> npm install + +Open up 01-scrape-bands.js and customize the band list links + +Run +> node 01-scrape-bands.js + +This script will save the artist data to output/artists/*.json + +Open up 02-scrape-artist-tab-urls.js and customize the artist file list + +Run +> node 02-scrape-artist-tab-urls.js + +This script will add tab information to the artist data and save it to output/artists-with-tabs/*.json + +## 2. Ingest scraped URLs into a sqlite database + +This step converts the .json files into a sqlite database to allow scraping to be paused and restarted easily. + +Copy 01-scraper-urls/output/artists-with-tabs/*.json into 02-ingest-sqlite/input/ + +Run +> npm install +> node 01-injest-sqlite.js + +Note: this script queues up the artist inserts into the database and then waits for the inserts to finish. Don't be surprised if it hangs for a few hours (it took 6 hours on my SSD). + +This script will create a sqlite database, output/tabs-no-text.db + +## 3. (Optional) Split the sqlite database for parallelized scraping + +This step splits the generated sqlite database into multiple databases so that you can more easily use multiple machines to scrape the site. + +Copy the tabs-no-text.db into 03-splitter-sqlite/ + +Determine the number of machines that you want to run the scraper on. 
Call this number N + +Open up tabs-no-text.db +> sqlite3 tabs-no-text.db + +Create a view that separates the tabs into buckets +> CREATE VIEW tabs_bucketed AS SELECT *, NTILE(N) OVER (ORDER BY rowid) AS bucket FROM tabs; + +For each machine, i: + +1) Create a new database +> sqlite3 tabs-i-no-text.db + +2) Attach to the base database +> ATTACH 'tabs-no-text.db' AS db2; + +3) Create a table with the rows from the machine's bucket +> CREATE TABLE tabs AS SELECT * FROM db2.tabs_bucketed WHERE bucket=i; + +Make sure to hold onto your tabs-no-text.db database for the merging process. + +## 4. Scrape the tabs + +For each machine, i + +Copy the machine's tabs database into 04-scraper-tabs/input/tabs.db + +Important: Make sure to **change the name** of the database file to **tabs.db** OR the file name in 01-scrape-tabs.js:100 - filename: './input/tabs-laptop.db'. + +### 4.1 (Recommended) Index your database + +It's highly recommended to index your database + +Open up 04-scraper-tabs/input/ + +Run +> sqlite3 tabs.db + +Run the following SQL queries +CREATE INDEX IF NOT EXISTS tabs_scrape_id_idx ON tabs (scrape_id); +CREATE INDEX IF NOT EXISTS tabs_tab_url_idx ON tabs (tab_url); +CREATE INDEX IF NOT EXISTS tabs_type_name_idx ON tabs (type_name); + +### 4.2 (Optional) Optimize the tab scraper + +Customize 01-scrape-tabs.js + +Key Lines: + +1) Line 192: let queue = new ConcurrentQueue(5); +- - Increasing this value will increase the number of concurrent requests sent. +- - Note: Higher concurrent request counts result in more aggressive scrapes that may run more quickly but also get you kicked off more quickly + +2) Line 214: await sleep(100); +- - Increasing this value (in ms) will increase the delay between the first few concurrent requests. This staggers the requests, potentially reducing the chance you get kicked off. 
+ +3) Line 183: LIMIT 300 +- - Increasing the value in this line will increase the number of tabs scraped from the database at a time before sending a status update. Lower values will query the database more but give more frequent status updates. Higher values will take up more space in process memory and give less frequent status updates. +- - If this value is set too low, removed tab urls will likely fill up the result set, causing the program to incorrectly detect that it got kicked off. + +I found that running 500 tabs/minute gave me a good balance in effort spent reconnecting to VPN and scraping speed. Typically I would have to reset the scraper every 4-6 hours with this rate. + +I got this with concurrency=5 and sleep=100. + +### 4.3 Add required columns for scraping + +Open your tabs.db in sqlite +> sqlite3 tabs.db + +Run the following commands to add the needed columns: + +ALTER TABLE tabs DROP COLUMN tab_text; +ALTER TABLE tabs ADD COLUMN user_id INTEGER; +ALTER TABLE tabs ADD COLUMN user_iq INTEGER; +ALTER TABLE tabs ADD COLUMN username TEXT; +ALTER TABLE tabs ADD COLUMN tab_text TEXT; + +### 4.4 Scrape the tabs + +Important: Make sure to **change the name** of the database file to **tabs.db** OR the file name in 01-scrape-tabs.js:100 - filename: './input/tabs-laptop.db'. + +Run +> npm install +> node 01-scrape-tabs.js + +This will add tab information to the tabs.db database. + +Note: This scraper only works for the following tab types: +- Bass +- Chords +- Drums +- Tab +- Ukulele + +The following tab types are not supported: +- Guitar Pro +- Official +- Power +- Video + +## 5. (Optional) Merge the sqlite databases + +Move the partial tabs databases from each machine to 05-merger-sqlite/input/{tabs-i.db} +- Note: i is the machine number from before + +Move the tabs-no-text.db database from step 2 into 05-merger-sqlite/input/. 
+ +Open up tabs-no-text.db +> sqlite3 tabs-no-text.db + +Create an index on tabs.tab_url +> CREATE INDEX tabs_tab_url_idx ON tabs (tab_url); + +Open up the final tabs database, tabs-full.db +> sqlite3 tabs-full.db + +Attach the no-text database +> ATTACH 'tabs-no-text.db' AS 'dbnt'; + +Attach each machine database in the following format: +> ATTACH 'tabs-i.db' AS 'dbi'; + +Create the final tabs table +CREATE TABLE tabs ( + scrape_id INTEGER + , artist_scrape_id INTEGER NOT NULL + , id INTEGER + , song_id INTEGER + , song_name TEXT + , artist_id INTEGER + , artist_name INTEGER + , type TEXT + , part TEXT + , version INTEGER + , votes INTEGER + , rating NUMERIC + , date TEXT + , status TEXT + , preset_id INTEGER + , tab_access_type TEXT + , tp_version INTEGER + , tonality_name TEXT + , version_description TEXT + , verified INTEGER + , artist_url TEXT + , tab_url TEXT + , difficulty TEXT + , tuning TEXT + , type_name TEXT + , user_id INTEGER + , user_iq INTEGER + , username TEXT + , tab_text TEXT +); + +For each machine, insert its respective tabs into the table +INSERT INTO + tabs +SELECT + tabsnt.scrape_id + , tabsnt.artist_scrape_id + , tabsnt.id + , tabsnt.song_id + , tabsnt.song_name + , tabsnt.artist_id + , tabsnt.artist_name + , tabsnt.type + , tabsnt.part + , tabsnt.version + , tabsnt.votes + , tabsnt.rating + , tabsnt.date + , tabsnt.status + , tabsnt.preset_id + , tabsnt.tab_access_type + , tabsnt.tp_version + , tabsnt.tonality_name + , tabsnt.version_description + , tabsnt.verified + , tabsnt.artist_url + , tabsnt.tab_url + , tabsnt.difficulty + , tabsnt.tuning + , tabsnt.type_name + , tabsm.user_id + , tabsm.user_iq + , tabsm.username + , tabsm.tab_text +FROM + dbnt.tabs AS tabsnt + JOIN dbi.tabs AS tabsm ON tabsnt.tab_url=tabsm.tab_url +WHERE + tabsm.tab_url IS NOT NULL + AND tabsm.tab_text IS NOT NULL; + +Note: this command can take a bit to complete (30s-2m) depending on how large your databases are. + +## 6. 
Print the contents of the database into organized text files + +Copy your filled tabs database to 06-output-generator/input/tabs-full.db + +Note: Make sure you either rename it in the directory or update 01-output-generator.js:52 with the proper file name + +Run +> npm install +> node --max-old-space-size=16384 01-output-generator.js + +Note: depending on how many tabs you scraped, you may have to increase the max-old-space-size (Max RAM). This example uses 16 GB of RAM. +- I'm suspicious the memory leak is in the sqlite package >:| + +Congratulations! Your guitar tabs are now organized in: +06-output-generator/output/{type}/{artist}-{artist_id}/{song}.txt + +## Other Information + +You can customize the output generator's file output by modifying the fileText variable in 01-output-generator.js:84-99 + +Note: The .keep files can be ignored/deleted. They exist only to preserve the default directory structure in git.