const fs = require('fs/promises');
const path = require('path');
const jsdom = require('jsdom');
const fetch = (...args) => import('node-fetch').then(({ default: fetch }) => fetch(...args));
const ConcurrentQueue = require('./concurrent-queue.js');
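// ConcurrentQueue (./concurrent-queue.js, not shown here) is assumed to expose
// push(task) and waitForDrain(), per its use below. A minimal sketch of that
// assumed interface, not necessarily the actual implementation:
//
//   class ConcurrentQueue {
//       constructor(limit) { this.limit = limit; this.active = 0; this.pending = []; }
//       push(task) { this.pending.push(task); this._pump(); }
//       _pump() {
//           // Start queued tasks until the concurrency limit is reached
//           while (this.active < this.limit && this.pending.length > 0) {
//               this.active += 1;
//               this.pending.shift()().finally(() => { this.active -= 1; this._pump(); });
//           }
//       }
//       async waitForDrain() {
//           // Poll until no tasks are running or queued
//           while (this.active > 0 || this.pending.length > 0) {
//               await new Promise(resolve => setTimeout(resolve, 50));
//           }
//       }
//   }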
class ScrapeError extends Error {
    constructor(message, options) {
        super(message, options);
        this.name = 'ScrapeError';
    }
}
// From Cordis util.js
function sanitizeFileName(name) {
    // Windows version (created for Windows; most likely works cross-platform too given my research)
    // Allowed characters: extended Unicode charset (1-255)
    // Illegal file names: CON, PRN, AUX, NUL, COM1-COM9, LPT1-LPT9
    // Reserved characters: <>:"/\|?*
    // Solution: strip reserved characters, replace disallowed characters with '_', and append '_' to illegal names
    // Illegal file names (Windows); checked case-insensitively since Windows reserves these names in any case
    if (['CON', 'PRN', 'AUX', 'NUL',
        'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
        'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'].indexOf(name.toUpperCase()) != -1) {
        name += '_';
    }
    // Reserved characters
    name = name.replace(/[<>:"\/\\|?*]/g, '');
    // Allowed characters (code points 1-255); anything else becomes '_'
    return name.split('').map(c => c.charCodeAt(0) <= 255 && c.charCodeAt(0) > 0 ? c : '_').join('');
    // Much stricter whitelist version
    // replace bad characters with '_'
    //return name.split('').map(c => /[A-Za-z0-9-]/.exec(c) ? c : '_').join('');
}
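// Examples (illustrative):
//   sanitizeFileName('CON') === 'CON_'
//   sanitizeFileName('a<b>?c') === 'abc'
//   sanitizeFileName('https://www.ultimate-guitar.com/bands/d.htm.json')
//       === 'httpswww.ultimate-guitar.combandsd.htm.json'   // as used in saveBandLists below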
function estMSRemaining(startTime, ratioComplete) {
    return (1 - ratioComplete) * ((Date.now() - startTime) / ratioComplete);
}
function formatRelative(msRelative) {
    if (msRelative < 1000) return `${msRelative.toFixed(2)} ms`;
    else if (msRelative < 60 * 1000) return `${(msRelative / 1000).toFixed(2)} s`;
    else if (msRelative < 60 * 60 * 1000) return `${(msRelative / (60 * 1000)).toFixed(2)} mins`;
    else return `${(msRelative / (60 * 60 * 1000)).toFixed(2)} hours`;
}
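// Example (illustrative): if 25% of the work is done 10 s after startTime,
// estMSRemaining(startTime, 0.25) === 30000 and formatRelative(30000) === '30.00 s'.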
async function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
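// Randomized 500-1000 ms delay between requests (basic rate limiting).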
async function fuzzyDelay() {
    await sleep(500 + (500 * Math.random()));
}
async function saveJsonData(filename, dataJSON) {
    await fs.writeFile(filename, JSON.stringify(dataJSON));
}
// Note: This is the key scraper function. It fetches the page and scrapes the
// JSON blob stored in the .js-store element's data-content attribute.
async function scrapeUGDataContent(url) {
    let page = null;
    try {
        page = await fetch(url);
    } catch (e) {
        throw new ScrapeError('Unable to fetch url', { cause: e });
    }
    let text = null;
    try {
        text = await page.text();
    } catch (e) {
        throw new ScrapeError('Unable to decode page', { cause: e });
    }
    let dom = null;
    try {
        dom = new jsdom.JSDOM(text);
    } catch (e) {
        throw new ScrapeError('Unable to parse document', { cause: e });
    }
    if (!dom.window || !dom.window.document) {
        throw new ScrapeError('Unable to parse document');
    }
    const document = dom.window.document;
    const jsStore = document.querySelector('.js-store');
    if (jsStore == null) {
        throw new ScrapeError('Unable to find .js-store element for ' + url);
    }
    const contentJSON = jsStore.getAttribute('data-content');
    if (contentJSON == null) {
        throw new ScrapeError('Unable to find data-content attribute on .js-store');
    }
    let content = null;
    try {
        content = JSON.parse(contentJSON);
    } catch (e) {
        throw new ScrapeError('Unable to parse data-content JSON', { cause: e });
    }
    return content;
}
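// Based on the parse functions below, the data-content JSON is shaped roughly
// like (abridged):
//
//   { "store": { "page": { "data": {
//       "tab": { ... },                                    // tab pages
//       "tab_view": { "wiki_tab": { "content": "..." } },  // tab pages
//       "alpha": ..., "artists": [ ... ],                  // band list pages
//       ...
//   } } } }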
function parseTab(ugDataContent) {
    // Note: '== null' (not '=== null') so that missing properties (undefined) are caught too
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const meta = data.tab; if (meta == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab');
    const tview = data.tab_view; if (tview == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view');
    const wktab = tview.wiki_tab; if (wktab == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab');
    const text = wktab.content; if (text == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.tab_view.wiki_tab.content');
    return { meta, text };
}
function parseBandsPage(ugDataContent) {
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const alpha = data.alpha; if (alpha == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.alpha');
    const artists = data.artists; if (artists == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.artists');
    const pagenum = data.current_page; if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.current_page');
    const pagecnt = data.page_count; if (pagecnt == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.page_count');
    return { alpha, artists, pagenum, pagecnt };
}
function parseArtistPage(ugDataContent) {
    const store = ugDataContent.store; if (store == null) throw new ScrapeError('Unable to get ugDataContent.store');
    const page = store.page; if (page == null) throw new ScrapeError('Unable to get ugDataContent.store.page');
    const data = page.data; if (data == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data');
    const pagination = data.pagination; if (pagination == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination');
    const pagenum = pagination.current; if (pagenum == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.current');
    const pages = pagination.pages; if (pages == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.pagination.pages');
    const albumTabs = data.album_tabs; if (albumTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.album_tabs');
    const chordProTabs = data.chord_pro_tabs; if (chordProTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.chord_pro_tabs');
    const featTabs = data.feat_tabs; if (featTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.feat_tabs');
    const otherTabs = data.other_tabs; if (otherTabs == null) throw new ScrapeError('Unable to get ugDataContent.store.page.data.other_tabs');
    return { albumTabs, chordProTabs, featTabs, otherTabs, pagenum, pages };
}
// Returns a list of tab metadata (including tab URL)
async function scrapeAllArtistTabListPages(startURL) {
    let tabs = [];
    let url = new URL(startURL); // Note: does not consider the <base> tag; the implementation would have to change if this gets used somewhere that needs it
    while (true) {
        //console.log('scraping artist page: ' + url.toString());
        const ugDataContent = await scrapeUGDataContent(url.toString());
        const page = parseArtistPage(ugDataContent);
        tabs = tabs.concat(page.albumTabs, page.chordProTabs, page.featTabs, page.otherTabs);
        const nextPageData = page.pages.find(pageData => pageData.page == page.pagenum + 1);
        if (nextPageData == null) break;
        url = new URL(nextPageData.url, url);
        await fuzzyDelay();
    }
    // ug.com can return the same tab (same id) on different pages, so filter out the duplicates
    const uniqueTabIds = new Set();
    const uniqueTabs = [];
    for (let tab of tabs) {
        if (uniqueTabIds.has(tab.id)) continue;
        uniqueTabIds.add(tab.id);
        uniqueTabs.push(tab);
    }
    return uniqueTabs;
}
// Returns a list of artist metadata (including artist tab list URL)
async function scrapeAllBandListPages(startURL) {
    let artists = [];
    // e.g. https://www.ultimate-guitar.com/bands/d.htm
    let url = new URL(startURL);
    let startTime = Date.now();
    while (true) {
        const ugDataContent = await scrapeUGDataContent(url.toString());
        const page = parseBandsPage(ugDataContent);
        artists = artists.concat(page.artists);
        let fromNow = formatRelative(estMSRemaining(startTime, page.pagenum / page.pagecnt));
        console.log(`Band List Status: ${page.pagenum}/${page.pagecnt} pages complete (${(page.pagenum / page.pagecnt * 100).toFixed(2)}%, ${fromNow} remaining)`);
        if (page.pagenum + 1 > page.pagecnt) break;
        url = new URL(startURL.slice(0, -4) + (page.pagenum + 1) + '.htm'); // d.htm (start) -> d2.htm -> d3.htm -> ...
        await fuzzyDelay();
    }
    return artists;
}
async function saveBandList(filename, url) {
    let artists = await scrapeAllBandListPages(url);
    await saveJsonData(filename, artists);
}
async function saveBandLists(urls) {
    let startTime = Date.now();
    let completed = 0;
    for (const url of urls) {
        console.log('doing band list: ' + url);
        await saveBandList(path.join('output', 'artists', sanitizeFileName(url + '.json')), url);
        completed += 1;
        let fromNow = formatRelative(estMSRemaining(startTime, completed / urls.length));
        console.log(`Save All Band List Status: ${completed}/${urls.length} band lists complete (${(completed / urls.length * 100).toFixed(2)}%, ${fromNow} remaining)`);
        await fuzzyDelay();
    }
}
// Note: modifies artists to add a 'tabs' property to each artist. This property contains a list
// of the artist's tab metadata (tab text is fetched in a different step).
async function saveArtistsWithTabMetadata(filename, artists) {
    const baseURL = 'https://www.ultimate-guitar.com/';
    let startTime = Date.now();
    let completed = 0;
    let taskQueue = new ConcurrentQueue(8); // Run a maximum of 8 artist tab list scrapers at a time
    // Note: the concurrent queue will (almost certainly) cause the artists to end up somewhat to completely out of order in the output
    for (let artist of artists) {
        taskQueue.push(async () => {
            let artistStartURL = new URL(artist.artist_url, baseURL);
            let artistTabs = await scrapeAllArtistTabListPages(artistStartURL);
            artist.tabs = artistTabs;
            completed += 1;
            let fromNow = formatRelative(estMSRemaining(startTime, completed / artists.length));
            let pctPerMin = ((100 * completed / artists.length) / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
            let artistsPerMin = (completed / ((Date.now() - startTime) / (60 * 1000))).toFixed(2);
            console.log(`Save Artists with Tab Metadata Status: ${completed}/${artists.length} artists complete (${(completed / artists.length * 100).toFixed(2)}%, ${fromNow} remaining, ${pctPerMin}%/min, ${artistsPerMin} artists/min)`);
        });
    }
    await taskQueue.waitForDrain();
    await saveJsonData(filename, artists);
}
module.exports = {
    scrapeBands: saveBandLists,
    scrapeArtistTabUrls: saveArtistsWithTabMetadata
};
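// Example usage (hypothetical driver script; assumes this file is scraper.js,
// that output/artists/ already exists, and that band lists are split by
// starting letter as in the d.htm example above):
//
//   const { scrapeBands } = require('./scraper.js');
//   const letters = 'abcdefghijklmnopqrstuvwxyz'.split('');
//   const urls = letters.map(l => `https://www.ultimate-guitar.com/bands/${l}.htm`);
//   scrapeBands(urls)
//       .then(() => console.log('done'))
//       .catch(e => console.error(e));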