summary refs log tree commit diff
path: root/data
diff options
context:
space:
mode:
author Chris Gahan <chris@ill-logic.com> 2006-02-09 18:20:55 +0000
committer Chris Gahan <chris@ill-logic.com> 2006-02-09 18:20:55 +0000
commit 5720064dde5ff1205bf072ffe01e7ab070b2152a (patch)
tree 8a003f47697e92bc8d5da4c54aaea483e7745344 /data
parent 345df89e4d04c89c7cd43e21e918bf0b83bb1205 (diff)
Changed the way the URL grabber gets urls. Instead of using HEAD, it uses GET, but only grabs the first 50k of the page to check it for a header.
Diffstat (limited to 'data')
-rw-r--r-- data/rbot/plugins/url.rb 91
1 files changed, 60 insertions, 31 deletions
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index 396c5ef2..858b5a05 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -312,6 +312,31 @@ class UrlPlugin < Plugin
title = title[0..255] if title.length > 255
"[Link Info] title: #{title}"
end
+
+ def read_data_from_response(response, amount)
+
+ amount_read = 0
+ chunks = []
+
+ response.read_body do |chunk| # read body now
+
+ amount_read += chunk.length
+
+ if amount_read > amount
+ amount_of_overflow = amount_read - amount
+ chunk = chunk[0...-amount_of_overflow]
+ end
+
+ chunks << chunk
+
+ break if amount_read >= amount
+
+ end
+
+ chunks.join('')
+
+ end
+
def get_title_for_url(uri_str, depth=10)
# This god-awful mess is what the ruby http library has reduced me to.
@@ -326,37 +351,41 @@ class UrlPlugin < Plugin
return if url.scheme !~ /https?/
puts "+ connecting to #{url.host}:#{url.port}"
- http = @bot.httputil.get_proxy(url)
- title = http.start do |http|
- url.path = '/' if url.path == ''
- head = http.request_head(url.path)
- case head
- when Net::HTTPRedirection then
- # call self recursively if this is a redirect
- redirect_to = head['location']
- puts "+ redirect location: #{redirect_to}"
- url = URI.join url.to_s, redirect_to
- puts "+ whee, redirecting to #{url.to_s}!"
- title = get_title_for_url(url.to_s, depth-1)
- when Net::HTTPSuccess then
- if head['content-type'] =~ /^text\// and (not head['content-length'] or head['content-length'].to_i < 400000)
- # since the content is 'text/*' and is small enough to
- # be a webpage, retrieve the title from the page
- puts "+ getting #{url.request_uri}"
- response = http.request_get(url.request_uri)
- return get_title_from_html(response.body)
- else
- # content doesn't have title, just display info.
- size = head['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
- #lastmod = head['last-modified']
- return "[Link Info] type: #{head['content-type']}#{size ? ", size: #{size} bytes" : ""}"
- end
- when Net::HTTPClientError then
- return "[Link Info] Error getting link (#{head.code} - #{head.message})"
- when Net::HTTPServerError then
- return "[Link Info] Error getting link (#{head.code} - #{head.message})"
- end
- end
+ http = @bot.httputil.get_proxy(url)
+ title = http.start { |http|
+ url.path = '/' if url.path == ''
+
+ http.request_get(url.path) { |response|
+
+ case response
+ when Net::HTTPRedirection then
+ # call self recursively if this is a redirect
+ redirect_to = response['location']
+ puts "+ redirect location: #{redirect_to}"
+ url = URI.join url.to_s, redirect_to
+ puts "+ whee, redirecting to #{url.to_s}!"
+ title = get_title_for_url(url.to_s, depth-1)
+ when Net::HTTPSuccess then
+ if response['content-type'] =~ /^text\//
+ # since the content is 'text/*' and is small enough to
+ # be a webpage, retrieve the title from the page
+ puts "+ getting #{url.request_uri}"
+ data = read_data_from_response(response, 50000)
+ return get_title_from_html(data)
+ else
+ # content doesn't have title, just display info.
+ size = response['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
+ return "[Link Info] type: #{response['content-type']}#{size ? ", size: #{size} bytes" : ""}"
+ end
+ when Net::HTTPClientError then
+ return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+ when Net::HTTPServerError then
+ return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+ end # end of "case response"
+
+ } # end of request block
+ } # end of http start block
+
rescue SocketError => e
return "[Link Info] Error connecting to site (#{e.message})"
end