Personal tools
You are here: Home Viz Jones RadioSongScraper Groovy Script

RadioSongScraper Groovy Script

This is the script used to update the database for the Jones sketch.
/*
 * This script scrapes the show summary pages for Marketplace
 * and adds songs played for each episode to the MySQL radio db.
 * It then hits up Yahoo finance for daily DJIA numbers and loads
 * artist similarity data from Last.fm. The result should be a database
 * with information on what songs were played on what day, what artists
 * are similar, and how the market did. This data is plotted by a
 * different piece of software.
 *
 */

import groovy.sql.Sql
import net.roarsoftware.lastfm.Artist
import net.roarsoftware.lastfm.cache.Cache
import java.text.DateFormat
import java.text.SimpleDateFormat

System.setProperty('user.timezone', 'MST')
// create a connection to the db
def db= Sql.newInstance("jdbc:mysql://atomboy.isa-geek.com/radio", "groovy", "blorgus", "com.mysql.jdbc.Driver")
// Last.fm web service API key (see http://www.last.fm/api/intro to get yer own)
String apiKey = 'get yer own'
net.roarsoftware.lastfm.Caller.getInstance().setUserAgent("Graph by acoil (see http://pages.swcp.com/~atomboy/lastfmgraph/ )")
// set up the data folder and point the Last.fm api at it for cache
File dataFolder = new File('./data');
if (!dataFolder.isDirectory()) {
    dataFolder.mkdir()
}
Cache cache = new net.roarsoftware.lastfm.cache.FileSystemCache(dataFolder)
net.roarsoftware.lastfm.Caller.getInstance().setCache(cache)

// get the website's assigned id for marketplace from the db
def showID = db.firstRow("SELECT ShowID FROM tblShows WHERE Name = 'Marketplace'").ShowID

// what's the date for the latest episode in the db?
def dbShowDate = db.firstRow('SELECT MAX(Date) AS maxDate FROM tblEpisodes;').maxDate
// the first date where there's song info on the show summary page
def showDate = new GregorianCalendar(2004, Calendar.NOVEMBER, 01)
if (dbShowDate != null) {
    showDate.setTime(new Date(dbShowDate.getTime()))
    println "Latest show in db is ${String.format('%tm-%<td-%<tY', showDate)}."
    // we want to start on the day AFTER the latest day we've got already
    showDate.add(Calendar.DAY_OF_YEAR, 1)
}
def yesterday = new GregorianCalendar().getInstance()
yesterday.add(Calendar.DAY_OF_YEAR, -1)


// every day between the last day and today
while (showDate.before(yesterday)) {
    def dow = showDate.get(Calendar.DAY_OF_WEEK)
    Boolean pauseAfterFetch = true
    if (dow != Calendar.SATURDAY && dow != Calendar.SUNDAY) { // no Marketplace on weekends
        Integer episodeID = -1
        try {
            db.execute("INSERT INTO tblEpisodes (Date, ShowID) VALUES (?, ?);", [showDate.getTime(), showID])
            episodeID = db.firstRow("SELECT LAST_INSERT_ID() AS id FROM tblEpisodes;").id
        } catch (Exception ex) {
            println ex
            System.exit(1)
        }
        if (episodeID != -1) {
            String sd = String.format('%tm-%<td-%<tY', showDate)
            String address = "http://marketplace.publicradio.org/episodes/show_rundown.php?show_id=${showID}&start_date=${sd}"
            File file = new File(dataFolder, address.replace('?', '_').tokenize("/")[-1])
            if (!file.isFile()) {
                println "reading $address"
                def out = new BufferedOutputStream(new FileOutputStream(file))
                out << new URL(address).openStream()
                out.close()
            } else {
                println "file for $address found in data folder."
                pauseAfterFetch = false
            }
            // read the web page with a simple state machine
            Boolean inMusicSection = false
            Integer songSequence = 1
            def reader = new BufferedReader(new FileReader(file))
            reader.eachLine { line ->
                if (!inMusicSection && line ==~ /\s*<div id="musicFromShow">/) {
                    inMusicSection = true
                } else if (line ==~ /\s*<\/ul>/ ) {
                    inMusicSection = false
                    // we're done with the file here, but break doesn't work inside eachline
                } else if (inMusicSection) {
                    //println line
                    matcher = line =~ /^\s*<li><em class="song">([^<]*)<\/em>([^<]*)<.*$/
                    if (matcher.matches()) {
                        String title = matcher[0][1]
                        title = title.replace('&amp;', '&').trim()
                        if (title.length() > 254) { title = title[0..254] }
                        String artist = matcher[0][2]
                        artist = artist.replace('&amp;', '&').trim()
                        if (artist.length() > 254) { artist = artist[0..254] }
                        println "found '${title}' by ${artist}"
                        db.execute("INSERT INTO tblRadioSongs (EpisodeID, Artist, Title, Sequence) VALUES (?, ?, ?, ?)",
                            [episodeID, artist, title, songSequence])
                        songSequence++
                    }
                }
            }
            //break; //debug - just process one page
            if (pauseAfterFetch) { // to be nice to the web server at publicradio.org
                print "pausing.."
                (1..10).each {
                    print '.'
                    sleep 1000
                }
                println "-"
            }
        } else {
            println "didn't get a good EpisodeID"
        }
    }
    showDate.add(Calendar.DAY_OF_YEAR, 1) // increment by one day
}

// the db should now have an up to date list of songs played
// first, let's update any new song entries where we've previously
// found the artist.

// let's get mbid's for artists that don't have them
db.eachRow("SELECT DISTINCT Artist FROM tblRadioSongs WHERE ArtistID IS NULL AND Artist <> '';") { row ->
    String dbName = row.Artist
    String name = dbName.replaceAll("[*?]", "-")
    print "Getting info for ${name} - "
    def artist = Artist.getInfo(name, apiKey)
    sleep 200 // we're not supposed to make more than 5 requests a second to Last.fm
    if (artist!= null && artist.getMbid().length() == 36) { // a valid mbid will always have 36 chars
        Integer artistID = 0
        def row2 = db.firstRow("SELECT ArtistID FROM tblArtists WHERE mbid = ?", [artist.getMbid(),])
        if (row2 == null) {
            db.execute("INSERT INTO tblArtists (Name, mbid, url) VALUES (?, ?, ?)", [artist.getName(), artist.getMbid(), artist.getUrl()])
            artistID = db.firstRow("SELECT LAST_INSERT_ID() AS ArtistID FROM tblArtists").ArtistID
        } else {
            artistID = row2.ArtistID
        }
        db.execute("UPDATE tblRadioSongs SET ArtistID = ? WHERE Artist = ?", [artistID, dbName])
        println "matched to ${artist.getName()} and updated db."
    } else {
        println "NO MATCH."
        db.execute("UPDATE tblRadioSongs SET ArtistID = 0 WHERE Artist = ?", [name,])
    }
}

// update similarity info
db.eachRow("""SELECT DISTINCT a.ArtistID, a.Name
 FROM tblArtists AS a
 JOIN tblRadioSongs AS r ON r.ArtistID = a.ArtistID
 WHERE (a.DateGotSimilar IS NULL
 OR a.DateGotSimilar < DATE_SUB(CURDATE(),INTERVAL 180 DAY))
 AND r.ArtistID IS NOT NULL;""") { row ->
    Integer artistIDFrom = row.ArtistID
    print "Getting similar to ${row.Name}."
    db.execute("DELETE FROM tblArtistSimilarities WHERE ArtistIDFrom = ?", [artistIDFrom,])
    def artists = Artist.getSimilar(row.Name.replaceAll("[*?]", "-"), apiKey)
    sleep 200
    for (Artist simArtist: artists) {
        // we're checking for a valid mbid because we just don't want to deal with
        // matching band names. If we were after every last match, it might be worth
        // dealing with the many artists that don't have valid mbids and trying to
        // match their names with tblRadioSongs.Artist WHERE ArtistID = 0
        // I think this would produce more bogus matches than it's worth
        if (simArtist.getMbid() != null && simArtist.getMbid().length() == 36) {
            def row2 = db.firstRow("SELECT ArtistID FROM tblArtists WHERE mbid = ?", [simArtist.getMbid(),])
            Integer artistIDTo = 0
            if (row2 == null) {
                db.execute("INSERT INTO tblArtists (Name, mbid, url) VALUES (?, ?, ?)",
                [simArtist.getName(), simArtist.getMbid(), simArtist.getUrl()])
                artistIDTo = db.firstRow("SELECT LAST_INSERT_ID() AS ArtistID FROM tblArtists").ArtistID
            } else {
                artistIDTo = row2.ArtistID
            }
            try {
                db.execute("INSERT INTO tblArtistSimilarities (ArtistIDFrom, ArtistIDTo, Score) VALUES (?, ?, ?)",
                [artistIDFrom, artistIDTo, simArtist.getSimilarityMatch()])
                print "."
            } catch (com.mysql.jdbc.exceptions.jdbc4.MySQLIntegrityConstraintViolationException ex) {
                print "%"
            }
        } else { // no valid mbid
            print "o"
        }
    }
    db.execute("UPDATE tblArtists SET DateGotSimilar = CURDATE() WHERE ArtistID = ?", [artistIDFrom,])
    println "-"
}

// update the djia table with the latest from yahoo finance
def maxDJIADate = db.firstRow("SELECT MAX(Date) AS maxDate FROM djia").maxDate
def fromDate = new GregorianCalendar(2004, Calendar.NOVEMBER, 01)
if (maxDJIADate != null) {
    fromDate.setTime(new Date(maxDJIADate.getTime()))
    fromDate.add(Calendar.DAY_OF_YEAR, 1)
}
File tableFile = new File(dataFolder, "table.csv")
if (!tableFile.exists() && fromDate.before(yesterday)) {
    String djiaUrl = "http://ichart.finance.yahoo.com/table.csv?s=^DJI&a=${fromDate.get(Calendar.MONTH)}&b=${fromDate.get(Calendar.DATE)}&c=${fromDate.get(Calendar.YEAR)}&d=${yesterday.get(Calendar.MONTH)}&e=${yesterday.get(Calendar.DATE)}&f=${yesterday.get(Calendar.YEAR)}&g=d&ignore=.csv"
    println "Getting ${djiaUrl}"
    def out = new BufferedOutputStream(new FileOutputStream(tableFile))
    out << new URL(djiaUrl).openStream()
    out.close()
}
if (fromDate.before(yesterday)) {
    def reader = new BufferedReader(new FileReader(tableFile))
    reader.eachLine { line ->
        //Date,Open,High,Low,Close,Volume,Adj Close
        if (line != "Date,Open,High,Low,Close,Volume,Adj Close") {
            def tokens = line.split(",")
            def params = []
            params << new SimpleDateFormat('yyyy-MM-dd').parse(tokens[0])
            params << Double.parseDouble(tokens[1])
            params << Double.parseDouble(tokens[2])
            params << Double.parseDouble(tokens[3])
            params << Double.parseDouble(tokens[4])
            params << new BigInteger(tokens[5])
            params << Double.parseDouble(tokens[6])
            try {
                db.execute("INSERT INTO djia (Date,Open,High,Low,Close,Volume,AdjClose) VALUES (?,?,?,?,?,?,?)", params)
                print '.'
            } catch (com.mysql.jdbc.exceptions.jdbc4.MySQLIntegrityConstraintViolationException ex) {
                print 'o'
            }
        }
    }
    println "done!"
    reader.close()
    tableFile.delete()
}

Document Actions