diff options
author | Chris Gahan <chris@ill-logic.com> | 2006-02-09 18:20:55 +0000 |
---|---|---|
committer | Chris Gahan <chris@ill-logic.com> | 2006-02-09 18:20:55 +0000 |
commit | 5720064dde5ff1205bf072ffe01e7ab070b2152a (patch) | |
tree | 8a003f47697e92bc8d5da4c54aaea483e7745344 /data/rbot | |
parent | 345df89e4d04c89c7cd43e21e918bf0b83bb1205 (diff) |
Changed the way the URL grabber fetches URLs. Instead of issuing a HEAD request, it now issues a GET, but reads only the first 50k of the page when checking it for a title.
Diffstat (limited to 'data/rbot')
-rw-r--r-- | data/rbot/plugins/url.rb | 91 |
1 files changed, 60 insertions, 31 deletions
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb index 396c5ef2..858b5a05 100644 --- a/data/rbot/plugins/url.rb +++ b/data/rbot/plugins/url.rb @@ -312,6 +312,31 @@ class UrlPlugin < Plugin title = title[0..255] if title.length > 255 "[Link Info] title: #{title}" end +
+ def read_data_from_response(response, amount)
+ # Returns, as one string, at most `amount` characters (String#length) of the body chunks yielded by response.read_body.
+ amount_read = 0 # total length of every chunk seen so far, counted before any trimming
+ chunks = [] # the (possibly trimmed) pieces that make up the return value
+
+ response.read_body do |chunk| # block runs once for each chunk that read_body yields
+
+ amount_read += chunk.length
+ # If this chunk took the total past the limit, cut the excess off its tail.
+ if amount_read > amount
+ amount_of_overflow = amount_read - amount
+ chunk = chunk[0...-amount_of_overflow] # keep everything except the last amount_of_overflow characters
+ end
+
+ chunks << chunk
+ # Leave the read_body block once the limit has been reached, so no later chunk is processed.
+ break if amount_read >= amount
+
+ end
+ # Glue the collected pieces together; an empty chunk list yields an empty string.
+ chunks.join('')
+
+ end
+
def get_title_for_url(uri_str, depth=10) # This god-awful mess is what the ruby http library has reduced me to. @@ -326,37 +351,41 @@ class UrlPlugin < Plugin return if url.scheme !~ /https?/ puts "+ connecting to #{url.host}:#{url.port}" - http = @bot.httputil.get_proxy(url) - title = http.start do |http| - url.path = '/' if url.path == '' - head = http.request_head(url.path) - case head - when Net::HTTPRedirection then - # call self recursively if this is a redirect - redirect_to = head['location'] - puts "+ redirect location: #{redirect_to}" - url = URI.join url.to_s, redirect_to - puts "+ whee, redirecting to #{url.to_s}!" - title = get_title_for_url(url.to_s, depth-1) - when Net::HTTPSuccess then - if head['content-type'] =~ /^text\// and (not head['content-length'] or head['content-length'].to_i < 400000) - # since the content is 'text/*' and is small enough to - # be a webpage, retrieve the title from the page - puts "+ getting #{url.request_uri}" - response = http.request_get(url.request_uri) - return get_title_from_html(response.body) - else - # content doesn't have title, just display info. - size = head['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') - #lastmod = head['last-modified'] - return "[Link Info] type: #{head['content-type']}#{size ? ", size: #{size} bytes" : ""}" - end - when Net::HTTPClientError then - return "[Link Info] Error getting link (#{head.code} - #{head.message})" - when Net::HTTPServerError then - return "[Link Info] Error getting link (#{head.code} - #{head.message})" - end - end + http = @bot.httputil.get_proxy(url) + title = http.start { |http| + url.path = '/' if url.path == ''
+
+ http.request_get(url.path) { |response|
+ + case response + when Net::HTTPRedirection then + # call self recursively if this is a redirect + redirect_to = response['location'] + puts "+ redirect location: #{redirect_to}" + url = URI.join url.to_s, redirect_to + puts "+ whee, redirecting to #{url.to_s}!" + title = get_title_for_url(url.to_s, depth-1) + when Net::HTTPSuccess then + if response['content-type'] =~ /^text\// + # since the content is 'text/*' and is small enough to + # be a webpage, retrieve the title from the page + puts "+ getting #{url.request_uri}"
+ data = read_data_from_response(response, 50000)
+ return get_title_from_html(data) + else + # content doesn't have title, just display info. + size = response['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') + return "[Link Info] type: #{response['content-type']}#{size ? ", size: #{size} bytes" : ""}" + end + when Net::HTTPClientError then + return "[Link Info] Error getting link (#{response.code} - #{response.message})" + when Net::HTTPServerError then + return "[Link Info] Error getting link (#{response.code} - #{response.message})" + end # end of "case response"
+
+ } # end of request block + } # end of http start block
+ rescue SocketError => e return "[Link Info] Error connecting to site (#{e.message})" end |