RadioSongScraper Groovy Script
This is the script used to update the database for the Jones sketch.
/*
* This script scrapes the show summary pages for Marketplace
* and adds songs played for each episode to the MySQL radio db.
* It then hits up Yahoo finance for daily DJIA numbers and loads
* artist similarity data from Last.fm. The result should be a database
* with information on what songs were played on what day, what artists
* are similar, and how the market did. This data is plotted by a
* different piece of software.
*
*/
import groovy.sql.Sql
import net.roarsoftware.lastfm.Artist
import net.roarsoftware.lastfm.cache.Cache
import java.text.DateFormat
import java.text.SimpleDateFormat
System.setProperty('user.timezone', 'MST')
// create a connection to the db
def db= Sql.newInstance("jdbc:mysql://atomboy.isa-geek.com/radio", "groovy", "blorgus", "com.mysql.jdbc.Driver")
// Last.fm web service API key (see http://www.last.fm/api/intro to get yer own)
String apiKey = 'get yer own'
net.roarsoftware.lastfm.Caller.getInstance().setUserAgent("Graph by acoil (see http://pages.swcp.com/~atomboy/lastfmgraph/ )")
// set up the data folder and point the Last.fm api at it for cache
File dataFolder = new File('./data');
if (!dataFolder.isDirectory()) {
dataFolder.mkdir()
}
Cache cache = new net.roarsoftware.lastfm.cache.FileSystemCache(dataFolder)
net.roarsoftware.lastfm.Caller.getInstance().setCache(cache)
// get the website's assigned id for marketplace from the db
def showID = db.firstRow("SELECT ShowID FROM tblShows WHERE Name = 'Marketplace'").ShowID
// what's the date for the latest episode in the db?
def dbShowDate = db.firstRow('SELECT MAX(Date) AS maxDate FROM tblEpisodes;').maxDate
// the first date where there's song info on the show summary page
def showDate = new GregorianCalendar(2004, Calendar.NOVEMBER, 01)
if (dbShowDate != null) {
showDate.setTime(new Date(dbShowDate.getTime()))
println "Latest show in db is ${String.format('%tm-%<td-%<tY', showDate)}."
// we want to start on the day AFTER the latest day we've got already
showDate.add(Calendar.DAY_OF_YEAR, 1)
}
def yesterday = new GregorianCalendar().getInstance()
yesterday.add(Calendar.DAY_OF_YEAR, -1)
// every day between the last day and today
while (showDate.before(yesterday)) {
def dow = showDate.get(Calendar.DAY_OF_WEEK)
Boolean pauseAfterFetch = true
if (dow != Calendar.SATURDAY && dow != Calendar.SUNDAY) { // no Marketplace on weekends
Integer episodeID = -1
try {
db.execute("INSERT INTO tblEpisodes (Date, ShowID) VALUES (?, ?);", [showDate.getTime(), showID])
episodeID = db.firstRow("SELECT LAST_INSERT_ID() AS id FROM tblEpisodes;").id
} catch (Exception ex) {
println ex
System.exit(1)
}
if (episodeID != -1) {
String sd = String.format('%tm-%<td-%<tY', showDate)
String address = "http://marketplace.publicradio.org/episodes/show_rundown.php?show_id=${showID}&start_date=${sd}"
File file = new File(dataFolder, address.replace('?', '_').tokenize("/")[-1])
if (!file.isFile()) {
println "reading $address"
def out = new BufferedOutputStream(new FileOutputStream(file))
out << new URL(address).openStream()
out.close()
} else {
println "file for $address found in data folder."
pauseAfterFetch = false
}
// read the web page with a simple state machine
Boolean inMusicSection = false
Integer songSequence = 1
def reader = new BufferedReader(new FileReader(file))
reader.eachLine { line ->
if (!inMusicSection && line ==~ /\s*<div id="musicFromShow">/) {
inMusicSection = true
} else if (line ==~ /\s*<\/ul>/ ) {
inMusicSection = false
// we're done with the file here, but break doesn't work inside eachline
} else if (inMusicSection) {
//println line
matcher = line =~ /^\s*<li><em class="song">([^<]*)<\/em>([^<]*)<.*$/
if (matcher.matches()) {
String title = matcher[0][1]
title = title.replace('&', '&').trim()
if (title.length() > 254) { title = title[0..254] }
String artist = matcher[0][2]
artist = artist.replace('&', '&').trim()
if (artist.length() > 254) { artist = artist[0..254] }
println "found '${title}' by ${artist}"
db.execute("INSERT INTO tblRadioSongs (EpisodeID, Artist, Title, Sequence) VALUES (?, ?, ?, ?)",
[episodeID, artist, title, songSequence])
songSequence++
}
}
}
//break; //debug - just process one page
if (pauseAfterFetch) { // to be nice to the web server at publicradio.org
print "pausing.."
(1..10).each {
print '.'
sleep 1000
}
println "-"
}
} else {
println "didn't get a good EpisodeID"
}
}
showDate.add(Calendar.DAY_OF_YEAR, 1) // increment by one day
}
// the db should now have an up to date list of songs played
// first, let's update any new song entries where we've previously
// found the artist.
// let's get mbid's for artists that don't have them
db.eachRow("SELECT DISTINCT Artist FROM tblRadioSongs WHERE ArtistID IS NULL AND Artist <> '';") { row ->
String dbName = row.Artist
String name = dbName.replaceAll("[*?]", "-")
print "Getting info for ${name} - "
def artist = Artist.getInfo(name, apiKey)
sleep 200 // we're not supposed to make more than 5 requests a second to Last.fm
if (artist!= null && artist.getMbid().length() == 36) { // a valid mbid will always have 36 chars
Integer artistID = 0
def row2 = db.firstRow("SELECT ArtistID FROM tblArtists WHERE mbid = ?", [artist.getMbid(),])
if (row2 == null) {
db.execute("INSERT INTO tblArtists (Name, mbid, url) VALUES (?, ?, ?)", [artist.getName(), artist.getMbid(), artist.getUrl()])
artistID = db.firstRow("SELECT LAST_INSERT_ID() AS ArtistID FROM tblArtists").ArtistID
} else {
artistID = row2.ArtistID
}
db.execute("UPDATE tblRadioSongs SET ArtistID = ? WHERE Artist = ?", [artistID, dbName])
println "matched to ${artist.getName()} and updated db."
} else {
println "NO MATCH."
db.execute("UPDATE tblRadioSongs SET ArtistID = 0 WHERE Artist = ?", [name,])
}
}
// update similarity info
db.eachRow("""SELECT DISTINCT a.ArtistID, a.Name
FROM tblArtists AS a
JOIN tblRadioSongs AS r ON r.ArtistID = a.ArtistID
WHERE (a.DateGotSimilar IS NULL
OR a.DateGotSimilar < DATE_SUB(CURDATE(),INTERVAL 180 DAY))
AND r.ArtistID IS NOT NULL;""") { row ->
Integer artistIDFrom = row.ArtistID
print "Getting similar to ${row.Name}."
db.execute("DELETE FROM tblArtistSimilarities WHERE ArtistIDFrom = ?", [artistIDFrom,])
def artists = Artist.getSimilar(row.Name.replaceAll("[*?]", "-"), apiKey)
sleep 200
for (Artist simArtist: artists) {
// we're checking for a valid mbid because we just don't want to deal with
// matching band names. If we were after every last match, it might be worth
// dealing with the many artists that don't have valid mbids and trying to
// match their names with tblRadioSongs.Artist WHERE ArtistID = 0
// I think this would produce more bogus matches than it's worth
if (simArtist.getMbid() != null && simArtist.getMbid().length() == 36) {
def row2 = db.firstRow("SELECT ArtistID FROM tblArtists WHERE mbid = ?", [simArtist.getMbid(),])
Integer artistIDTo = 0
if (row2 == null) {
db.execute("INSERT INTO tblArtists (Name, mbid, url) VALUES (?, ?, ?)",
[simArtist.getName(), simArtist.getMbid(), simArtist.getUrl()])
artistIDTo = db.firstRow("SELECT LAST_INSERT_ID() AS ArtistID FROM tblArtists").ArtistID
} else {
artistIDTo = row2.ArtistID
}
try {
db.execute("INSERT INTO tblArtistSimilarities (ArtistIDFrom, ArtistIDTo, Score) VALUES (?, ?, ?)",
[artistIDFrom, artistIDTo, simArtist.getSimilarityMatch()])
print "."
} catch (com.mysql.jdbc.exceptions.jdbc4.MySQLIntegrityConstraintViolationException ex) {
print "%"
}
} else { // no valid mbid
print "o"
}
}
db.execute("UPDATE tblArtists SET DateGotSimilar = CURDATE() WHERE ArtistID = ?", [artistIDFrom,])
println "-"
}
// update the djia table with the latest from yahoo finance
def maxDJIADate = db.firstRow("SELECT MAX(Date) AS maxDate FROM djia").maxDate
def fromDate = new GregorianCalendar(2004, Calendar.NOVEMBER, 01)
if (maxDJIADate != null) {
fromDate.setTime(new Date(maxDJIADate.getTime()))
fromDate.add(Calendar.DAY_OF_YEAR, 1)
}
File tableFile = new File(dataFolder, "table.csv")
if (!tableFile.exists() && fromDate.before(yesterday)) {
String djiaUrl = "http://ichart.finance.yahoo.com/table.csv?s=^DJI&a=${fromDate.get(Calendar.MONTH)}&b=${fromDate.get(Calendar.DATE)}&c=${fromDate.get(Calendar.YEAR)}&d=${yesterday.get(Calendar.MONTH)}&e=${yesterday.get(Calendar.DATE)}&f=${yesterday.get(Calendar.YEAR)}&g=d&ignore=.csv"
println "Getting ${djiaUrl}"
def out = new BufferedOutputStream(new FileOutputStream(tableFile))
out << new URL(djiaUrl).openStream()
out.close()
}
if (fromDate.before(yesterday)) {
def reader = new BufferedReader(new FileReader(tableFile))
reader.eachLine { line ->
//Date,Open,High,Low,Close,Volume,Adj Close
if (line != "Date,Open,High,Low,Close,Volume,Adj Close") {
def tokens = line.split(",")
def params = []
params << new SimpleDateFormat('yyyy-MM-dd').parse(tokens[0])
params << Double.parseDouble(tokens[1])
params << Double.parseDouble(tokens[2])
params << Double.parseDouble(tokens[3])
params << Double.parseDouble(tokens[4])
params << new BigInteger(tokens[5])
params << Double.parseDouble(tokens[6])
try {
db.execute("INSERT INTO djia (Date,Open,High,Low,Close,Volume,AdjClose) VALUES (?,?,?,?,?,?,?)", params)
print '.'
} catch (com.mysql.jdbc.exceptions.jdbc4.MySQLIntegrityConstraintViolationException ex) {
print 'o'
}
}
}
println "done!"
reader.close()
tableFile.delete()
}

