diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2010-10-14 13:18:54 +0200 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2010-10-14 13:48:45 +0200 |
commit | b9cef38a2d8908c35c14fa6d0b30319a7474e593 (patch) | |
tree | e48ff10f5716f50bc751f5e343cff44fd1b74cd0 /data/rbot/plugins | |
parent | efbd4bea3ef18d2a3649dc7a5159e0e910ba7149 (diff) |
imdb: update to latest html
Diffstat (limited to 'data/rbot/plugins')
-rw-r--r-- | data/rbot/plugins/imdb.rb | 43 |
1 files changed, 25 insertions, 18 deletions
diff --git a/data/rbot/plugins/imdb.rb b/data/rbot/plugins/imdb.rb index 7ffd2df8..8fae956a 100644 --- a/data/rbot/plugins/imdb.rb +++ b/data/rbot/plugins/imdb.rb @@ -13,11 +13,12 @@ class Imdb IMDB = "http://www.imdb.com" - TITLE_OR_NAME_MATCH = /<a href="(\/(?:title|name)\/(?:tt|nm)[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/ - TITLE_MATCH = /<a href="(\/title\/tt[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/ - NAME_MATCH = /<a href="(\/name\/nm[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/ - CREDIT_NAME_MATCH = /#{NAME_MATCH}<\/td><td[^>]+> \.\.\. <\/td><td[^>]+>(.+?)<\/td>/ + TITLE_OR_NAME_MATCH = /<a\s+href="(\/(?:title|name)\/(?:tt|nm)[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/ + TITLE_MATCH = /<a\s+href="(\/title\/tt[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/ + NAME_MATCH = /<a\s+href="(\/name\/nm[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/ + CREDIT_NAME_MATCH = /#{NAME_MATCH}\s*<\/td>\s*<td[^>]+>\s*\.\.\.\s*<\/td>\s*<td[^>]+>\s*(.+?)\s*<\/td>/m FINAL_ARTICLE_MATCH = /, ([A-Z]\S{0,2})$/ + DESC_MATCH = /<meta name="description" content="(.*?)\. (.*?)\. (.*?)\."\s*\/>/ MATCHER = { :title => TITLE_MATCH, @@ -99,7 +100,7 @@ class Imdb end def grab_info(info, body) - /<div (?:id="\S+-info" )?class="info">\s*<h5>#{info}:<\/h5>\s*(.*?)<\/div>/mi.match(body)[1] rescue nil + /<div (?:id="\S+-info" )?class="(?:txt-block|see-more inline canwrap)">\s*<h[45](?: class="inline")?>\s*#{info}:\s*<\/h[45]>\s*(.*?)<\/div>/mi.match(body)[1] rescue nil end def fix_article(org_tit) @@ -134,13 +135,16 @@ class Imdb if resp.code == "200" m = /<title>([^<]*)<\/title>/.match(resp.body) return nil if !m - title_date = m[1] - pre_title, date, extra = title_date.scan(/^(.*)\((\d\d\d\d(?:\/[IV]+)?)\)\s*(.+)?$/).first + title_date = m[1].ircify_html + debug title_date + # note that the date dash for series is a - (ndash), not a - (minus sign) + pre_title, extra, date, junk = title_date.scan(/^(.*)\((.+?\s+)?(\d\d\d\d(?:–(?:\d\d\d\d)?)?(?:\/[IV]+)?)\)\s*(.+)?$/).first + extra.strip! if extra pre_title.strip! - title = fix_article(pre_title.ircify_html) + title = fix_article(pre_title) dir = nil - data = grab_info(/Directors?/, resp.body) + data = grab_info(/(?:Director|Creator)s?/, resp.body) if data dir = data.scan(NAME_MATCH).map { |url, name| name.ircify_html @@ -165,21 +169,24 @@ class Imdb end ratings = "no votes" - m = /<b>([0-9.]+)\/10<\/b>\n?\r?\s+[^<]+<a href="ratings"[^>]+>([0-9,]+) votes?<\/a>/.match(resp.body) + m = resp.body.match(/<b>([0-9.]+)<\/b><span [^>]+>\/10<\/span><\/span>\s*[^<]+<a\s+[^>]*href="ratings"[^>]+>([0-9,]+) votes?<\/a>/m) if m ratings = "#{m[1]}/10 (#{m[2]} voters)" end genre = Array.new - resp.body.scan(/<a href="\/Sections\/Genres\/[^\/]+\/">([^<]+)<\/a>/) do |gnr| + resp.body.scan(/<a href="\/genre\/[^"]+">([^<]+)<\/a>/) do |gnr| genre << gnr end - plot = nil - data = grab_info(/Plot(?: (?:Outline|Summary))?/, resp.body) - if data - plot = "Plot: " + data.ircify_html.gsub(/\s+more\s*$/,'').gsub(/\s+Full summary » \| Full synopsis »\s*$/,'') - end + plot = resp.body.match(DESC_MATCH)[3] rescue nil + # TODO option to extract the long storyline + # data = resp.body.match(/<h2>Storyline<\/h2>\s+/m).post_match.match(/<\/p>/).pre_match rescue nil + # if data + # data.sub!(/<em class="nobr">Written by.*$/m, '') + # plot = data.ircify_html.gsub(/\s+more\s*$/,'').gsub(/\s+Full summary » \| Full synopsis »\s*$/,'') + # end + plot = "Plot: #{plot}" if plot info << ["Ratings: " << ratings, "Genre: " << genre.join('/') , plot].compact.join(". ") @@ -203,9 +210,9 @@ class Imdb if resp.code == "200" m = /<title>([^<]*)<\/title>/.match(resp.body) return nil if !m - name = m[1] + name = m[1].sub(/ - IMDb/, '') - info << "#{name}" + info << name info.last << " : http://www.imdb.com#{sr}" unless opts[:nourl] return info if opts[:name_only] |