summaryrefslogtreecommitdiff
path: root/lib/rbot/core/utils
diff options
context:
space:
mode:
authorDmitry Kim <dmitry point kim at gmail point com>2007-03-30 23:44:02 +0000
committerDmitry Kim <dmitry point kim at gmail point com>2007-03-30 23:44:02 +0000
commitb11c3c4042b03e36639370002ecf86c44f7ddde4 (patch)
tree05a35024a2d56c7e3d313317376a17cb7c41a99f /lib/rbot/core/utils
parentb73d6c7dc6554e1c6eb6abce68350ed2c13191b8 (diff)
*** (httputil) major rework, new caching implementation, unified request
processing + (httputil) post support, partial request support, other features - (httputil) removed partial_body() and get_cached() [merged into get()] * (plugins/, utils) minimal changes to accommodate the new http_utils * (utils, ircbot) moved utils initialization into utils.rb * (tube.rb) (partially) accommodate upstream site layout changes
Diffstat (limited to 'lib/rbot/core/utils')
-rw-r--r--lib/rbot/core/utils/httputil.rb611
-rw-r--r--lib/rbot/core/utils/utils.rb5
2 files changed, 284 insertions, 332 deletions
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index aebd1e81..4ce8dcc3 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -5,10 +5,11 @@
#
# Author:: Tom Gilbert <tom@linuxbrit.co.uk>
# Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
+# Author:: Dmitry "jsn" Kim <dmitry point kim at gmail point com>
#
# Copyright:: (C) 2002-2005 Tom Gilbert
# Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta
-# Copyright:: (C) 2006,2007 Giuseppe Bilotta
+# Copyright:: (C) 2007 Giuseppe Bilotta, Dmitry Kim
require 'resolv'
require 'net/http'
@@ -19,25 +20,6 @@ rescue LoadError => e
error "Secured HTTP connections will fail"
end
-module ::Net
- class HTTPResponse
- # Read chunks from the body until we have at least _size_ bytes, yielding
- # the partial text at each chunk. Return the partial body.
- def partial_body(size=0, &block)
-
- partial = String.new
-
- self.read_body { |chunk|
- partial << chunk
- yield partial if block_given?
- break if size and size > 0 and partial.length >= size
- }
-
- return partial
- end
- end
-end
-
Net::HTTP.version_1_2
module ::Irc
@@ -79,16 +61,113 @@ class HttpUtil
:default => 8192,
:desc => "How many bytes to download from a web page to find some information. Set to 0 to let the bot download the whole page.")
+ # A cached HTTP response together with the bookkeeping used to decide
+ # whether it can be served as-is, must be revalidated, or is stale.
+ #
+ # response::   the cached response object
+ # date::       response time: the Date header when present, otherwise the
+ #              time of the last (re)validation
+ # expires::    time after which the entry has to be revalidated
+ # count::      number of uses since the last (re)validation
+ # first_used:: time of the first use since the last (re)validation
+ # last_used::  time of the most recent use
+ class CachedObject
+   attr_accessor :response, :last_used, :first_used, :count, :expires, :date
+
+   # Returns a new cache entry for +resp+, or nil when the response should
+   # not be cached: anything other than 200/301/302/206, a Cache-Control
+   # header containing "no-cache", or an Expires header that is already
+   # earlier than the response date. Time.httpdate raises on a malformed
+   # Date/Expires header; that exception is not handled here.
+   def self.maybe_new(resp)
+     debug "maybe new #{resp}"
+     return nil unless Net::HTTPOK === resp ||
+       Net::HTTPMovedPermanently === resp ||
+       Net::HTTPFound === resp ||
+       Net::HTTPPartialContent === resp
+
+     cc = resp['cache-control']
+     return nil if cc && (cc =~ /no-cache/i)
+
+     date = Time.now
+     if d = resp['date']
+       date = Time.httpdate(d)
+     end
+
+     return nil if resp['expires'] && (Time.httpdate(resp['expires']) < date)
+
+     debug "creating cache obj"
+
+     self.new(resp)
+   end
+
+   # Records one use of the entry: bumps the use counter and last_used, and
+   # sets first_used on the first use after a (re)validation.
+   def use
+     now = Time.now
+     @first_used = now if @count == 0
+     @last_used = now
+     @count += 1
+   end
+
+   # True when the entry must be revalidated before being served: either the
+   # response demands it via "must-revalidate" or its expiry time has passed.
+   def expired?
+     debug "checking expired?"
+     # Fetch the header before matching it: in `cc = hdr && cc =~ re` the
+     # assignment binds looser than &&, so cc is still nil when matched and
+     # must-revalidate would never be detected.
+     cc = self.response['cache-control']
+     return true if cc && cc =~ /must-revalidate/i
+     return self.expires < Time.now
+   end
+
+   # Adds the conditional-request headers for revalidating this entry to the
+   # header hash +hdr+ (modified in place): If-Modified-Since from the stored
+   # date and, when the cached response carried an ETag, If-None-Match.
+   def setup_headers(hdr)
+     # HTTP dates use the fixed GMT "httpdate" form; rfc2822 would emit a
+     # numeric zone offset instead.
+     hdr['if-modified-since'] = self.date.httpdate
+
+     debug "ims == #{hdr['if-modified-since']}"
+
+     if etag = self.response['etag']
+       hdr['if-none-match'] = etag
+       debug "etag: #{etag}"
+     end
+   end
+
+   # Marks the entry as freshly validated using the headers of +resp+ (the
+   # cached response itself by default): resets the usage statistics,
+   # records one use, and recomputes date and expires. Always returns true.
+   #
+   # Expiry is chosen from, in order: Cache-Control max-age, the Expires
+   # header, one fifth of the age implied by Last-Modified, or 300 seconds
+   # after the response date.
+   def revalidate(resp = self.response)
+     @count = 0
+     self.use
+     self.date = resp.key?('date') ? Time.httpdate(resp['date']) : Time.now
+
+     cc = resp['cache-control']
+     if cc && (cc =~ /max-age=(\d+)/)
+       self.expires = self.date + $1.to_i
+     elsif resp.key?('expires')
+       self.expires = Time.httpdate(resp['expires'])
+     elsif lm = resp['last-modified']
+       delta = self.date - Time.httpdate(lm)
+       # a Last-Modified that is not in the past gets a nominal 10s age
+       delta = 10 if delta <= 0
+       delta /= 5
+       self.expires = self.date + delta
+     else
+       self.expires = self.date + 300
+     end
+     # self.expires = Time.now + 10 # DEBUG
+     debug "expires on #{self.expires}"
+
+     return true
+   end
+
+   private
+   # Stores +resp+, computes its validity data and calls #body on it
+   # (presumably so the body is read while the connection is still open --
+   # TODO confirm). Any exception is logged and re-raised.
+   def initialize(resp)
+     @response = resp
+     begin
+       self.revalidate
+       self.response.body
+     rescue Exception => e
+       error e.message
+       error e.backtrace.join("\n")
+       raise e
+     end
+   end
+ end
+
def initialize(bot)
@bot = bot
@cache = Hash.new
@headers = {
- 'User-Agent' => "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)",
+ 'Accept-Charset' => 'utf-8;q=1.0, *;q=0.8',
+ 'User-Agent' =>
+ "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)"
+ }
+ debug "starting http cache cleanup timer"
+ @timer = @bot.timer.add(300) {
+ self.remove_stale_cache unless @bot.config['http.no_expire_cache']
}
- @last_response = nil
+ end
+
+ def cleanup
+ debug 'stopping http cache cleanup timer'
+ @bot.timer.remove(@timer)
end
- attr_reader :last_response
- attr_reader :headers
# if http_proxy_include or http_proxy_exclude are set, then examine the
# uri to see if this is a proxied uri
@@ -139,7 +218,13 @@ class HttpUtil
# proxying based on the bot's proxy configuration.
# This will include per-url proxy configuration based on the bot config
# +http_proxy_include/exclude+ options.
- def get_proxy(uri)
+
+ def get_proxy(uri, options = {})
+ opts = {
+ :read_timeout => 10,
+ :open_timeout => 5
+ }.merge(options)
+
proxy = nil
proxy_host = nil
proxy_port = nil
@@ -166,363 +251,227 @@ class HttpUtil
h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_port)
h.use_ssl = true if uri.scheme == "https"
+
+ h.read_timeout = opts[:read_timeout]
+ h.open_timeout = opts[:open_timeout]
return h
end
- # uri:: uri to query (Uri object)
- # readtimeout:: timeout for reading the response
- # opentimeout:: timeout for opening the connection
- #
- # simple get request, returns (if possible) response body following redirs
- # and caching if requested
- # if a block is given, it yields the urls it gets redirected to
- # TODO we really need something to implement proper caching
- def get(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"], cache=false)
- if uri_or_str.kind_of?(URI)
- uri = uri_or_str
- else
- uri = URI.parse(uri_or_str.to_s)
- end
- debug "Getting #{uri}"
-
- proxy = get_proxy(uri)
- proxy.open_timeout = opentimeout
- proxy.read_timeout = readtimeout
-
- begin
- proxy.start() {|http|
- yield uri.request_uri() if block_given?
- req = Net::HTTP::Get.new(uri.request_uri(), @headers)
- if uri.user and uri.password
- req.basic_auth(uri.user, uri.password)
- end
- resp = http.request(req)
- case resp
- when Net::HTTPSuccess
- if cache
- debug "Caching #{uri.to_s}"
- cache_response(uri.to_s, resp)
- end
- return resp.body
- when Net::HTTPRedirection
- if resp.key?('location')
- new_loc = URI.join(uri, resp['location'])
- debug "Redirecting #{uri} to #{new_loc}"
- yield new_loc if block_given?
- if max_redir > 0
- # If cache is an Array, we assume get was called by get_cached
- # because of a cache miss and that the first value of the Array
- # was the noexpire value. Since the cache miss might have been
- # caused by a redirection, we want to try get_cached again
- # TODO FIXME look at Python's httplib2 for a most likely
- # better way to handle all this mess
- if cache.kind_of?(Array)
- return get_cached(new_loc, readtimeout, opentimeout, max_redir-1, cache[0])
- else
- return get(new_loc, readtimeout, opentimeout, max_redir-1, cache)
- end
- else
- warning "Max redirection reached, not going to #{new_loc}"
- end
- else
- warning "Unknown HTTP redirection #{resp.inspect}"
- end
- else
- debug "HttpUtil.get return code #{resp.code} #{resp.body}"
+ def handle_response(uri, resp, opts, &block)
+ if Net::HTTPRedirection === resp && opts[:max_redir] >= 0
+ if resp.key?('location')
+ raise 'Too many redirections' if opts[:max_redir] <= 0
+ yield resp if opts[:yield] == :all && block_given?
+ loc = resp['location']
+ new_loc = URI.join(uri.to_s, loc) rescue URI.parse(loc)
+ new_opts = opts.dup
+ new_opts[:max_redir] -= 1
+ case opts[:method].to_s.downcase.intern
+ when :post, :"net::http::post"
+ new_opts[:method] = :get
end
- @last_response = resp
- return nil
- }
- rescue StandardError, Timeout::Error => e
- error "HttpUtil.get exception: #{e.inspect}, while trying to get #{uri}"
- debug e.backtrace.join("\n")
+ debug "following the redirect to #{new_loc}"
+ return get_response(new_loc, new_opts, &block)
+ else
+ warning ":| redirect w/o location?"
+ end
end
- @last_response = nil
- return nil
- end
-
- # just like the above, but only gets the head
- def head(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"])
- if uri_or_str.kind_of?(URI)
- uri = uri_or_str
+ if block_given?
+ yield(resp)
else
- uri = URI.parse(uri_or_str.to_s)
+ resp.body
end
- proxy = get_proxy(uri)
- proxy.open_timeout = opentimeout
- proxy.read_timeout = readtimeout
-
- begin
- proxy.start() {|http|
- yield uri.request_uri() if block_given?
- req = Net::HTTP::Head.new(uri.request_uri(), @headers)
- if uri.user and uri.password
- req.basic_auth(uri.user, uri.password)
- end
- resp = http.request(req)
- case resp
- when Net::HTTPSuccess
- return resp
- when Net::HTTPRedirection
- debug "Redirecting #{uri} to #{resp['location']}"
- yield resp['location'] if block_given?
- if max_redir > 0
- return head( URI.parse(resp['location']), readtimeout, opentimeout, max_redir-1)
- else
- warning "Max redirection reached, not going to #{resp['location']}"
- end
- else
- debug "HttpUtil.head return code #{resp.code}"
- end
- @last_response = resp
- return nil
- }
- rescue StandardError, Timeout::Error => e
- error "HttpUtil.head exception: #{e.inspect}, while trying to get #{uri}"
- debug e.backtrace.join("\n")
- end
- @last_response = nil
- return nil
+ return resp
end
# uri:: uri to query (Uri object or String)
# opts:: options. Currently used:
+ # :method:: request method [:get (default), :post or :head]
# :open_timeout:: open timeout for the proxy
# :read_timeout:: read timeout for the proxy
# :cache:: should we cache results?
+ # :yield:: if :final [default], call &block for the response object
+ # if :all, call &block for all intermediate redirects, too
+ # :max_redir:: how many redirects to follow before raising the exception
+ # if -1, don't follow redirects, just return them
+ # :range:: make a ranged request (usually GET). accepts a string
+ # for HTTP/1.1 "Range:" header (i.e. "bytes=0-1000")
+ # :body:: request body (usually for POST requests)
#
- # This method is used to get responses following redirections.
+ # Generic http transaction method
#
- # It will return either a Net::HTTPResponse or an error.
+ # It will return a HTTP::Response object or raise an exception
#
- # If a block is given, it will yield the response or error instead of
- # returning it
- #
- def get_response(uri_or_str, opts={}, &block)
- if uri_or_str.kind_of?(URI)
- uri = uri_or_str
- else
- uri = URI.parse(uri_or_str.to_s)
+ # If a block is given, it will yield the response (see :yield option)
+
+ def get_response(uri_or_s, options = {}, &block)
+ uri = uri_or_s.kind_of?(URI) ? uri_or_s : URI.parse(uri_or_s.to_s)
+ opts = {
+ :max_redir => @bot.config['http.max_redir'],
+ :yield => :final,
+ :cache => true,
+ :method => :GET
+ }.merge(options)
+
+ resp = nil
+ cached = nil
+
+ req_class = case opts[:method].to_s.downcase.intern
+ when :head, :"net::http::head"
+ opts[:max_redir] = -1
+ Net::HTTP::Head
+ when :get, :"net::http::get"
+ Net::HTTP::Get
+ when :post, :"net::http::post"
+ opts[:cache] = false
+ opts[:body] or raise 'post request w/o a body?'
+ warning "refusing to cache POST request" if options[:cache]
+ Net::HTTP::Post
+ else
+ warning "unsupported method #{opts[:method]}, doing GET"
+ Net::HTTP::Get
+ end
+
+ if req_class != Net::HTTP::Get && opts[:range]
+ warning "can't request ranges for #{req_class}"
+ opts.delete(:range)
end
- debug "Getting #{uri}"
- options = {
- :read_timeout => 10,
- :open_timeout => 5,
- :max_redir => @bot.config["http.max_redir"],
- :cache => false,
- :yield => :none
- }.merge(opts)
-
- cache = options[:cache]
+ cache_key = "#{opts[:range]}|#{req_class}|#{uri.to_s}"
- proxy = get_proxy(uri)
- proxy.open_timeout = options[:open_timeout]
- proxy.read_timeout = options[:read_timeout]
-
- begin
- proxy.start() {|http|
- req = Net::HTTP::Get.new(uri.request_uri(), @headers)
- if uri.user and uri.password
- req.basic_auth(uri.user, uri.password)
- end
- http.request(req) { |resp|
- case resp
- when Net::HTTPSuccess
- if cache
- debug "Caching #{uri.to_s}"
- cache_response(uri.to_s, resp)
- end
- when Net::HTTPRedirection
- if resp.key?('location')
- new_loc = URI.join(uri, resp['location']) rescue URI.parse(resp['location'])
- debug "Redirecting #{uri} to #{new_loc}"
- if options[:max_redir] > 0
- new_opts = options.dup
- new_opts[:max_redir] -= 1
- return get_response(new_loc, new_opts, &block)
- else
- raise "Too many redirections"
- end
- end
- end
- if block_given?
- yield resp
- else
- return resp
- end
- }
- }
- rescue StandardError, Timeout::Error => e
- error "HttpUtil.get_response exception: #{e.inspect}, while trying to get #{uri}"
- debug e.backtrace.join("\n")
- def e.body
- nil
- end
- if block_given?
- yield e
- else
- return e
+ if req_class != Net::HTTP::Get && req_class != Net::HTTP::Head
+ if opts[:cache]
+ warning "can't cache #{req_class.inspect} requests, working w/o cache"
+ opts[:cache] = false
end
end
- raise "This shouldn't happen"
- end
+ debug "get_response(#{uri}, #{opts.inspect})"
- def cache_response(k, resp)
- begin
- if resp.key?('pragma') and resp['pragma'] == 'no-cache'
- debug "Not caching #{k}, it has Pragma: no-cache"
- return
+ if opts[:cache] && cached = @cache[cache_key]
+ debug "got cached"
+ if !cached.expired?
+ debug "using cached"
+ cached.use
+ return handle_response(uri, cached.response, opts, &block)
end
- # TODO should we skip caching if neither last-modified nor etag are present?
- now = Time.new
- u = Hash.new
- u = Hash.new
- u[:body] = resp.body
- u[:last_modified] = nil
- u[:last_modified] = Time.httpdate(resp['date']) if resp.key?('date')
- u[:last_modified] = Time.httpdate(resp['last-modified']) if resp.key?('last-modified')
- u[:expires] = now
- u[:expires] = Time.httpdate(resp['expires']) if resp.key?('expires')
- u[:revalidate] = false
- if resp.key?('cache-control')
- # TODO max-age
- case resp['cache-control']
- when /no-cache|must-revalidate/
- u[:revalidate] = true
- end
+ end
+
+ headers = @headers.dup.merge(opts[:headers] || {})
+ headers['Range'] = opts[:range] if opts[:range]
+
+ cached.setup_headers(headers) if cached && (req_class == Net::HTTP::Get)
+ req = req_class.new(uri.request_uri, headers)
+ req.basic_auth(uri.user, uri.password) if uri.user && uri.password
+ req.body = opts[:body] if req_class == Net::HTTP::Post
+ debug "prepared request: #{req.to_hash.inspect}"
+
+ get_proxy(uri, opts).start do |http|
+ http.request(req) do |resp|
+ if Net::HTTPNotModified === resp
+ debug "not modified"
+ begin
+ cached.revalidate(resp)
+ rescue Exception => e
+ error e.message
+ error e.backtrace.join("\n")
+ end
+ debug "reusing cached"
+ resp = cached.response
+ elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp
+ debug "http error, deleting cached obj" if cached
+ @cache.delete(cache_key)
+ elsif opts[:cache] && cached = CachedObject.maybe_new(resp) rescue nil
+ debug "storing to cache"
+ @cache[cache_key] = cached
+ end
+ return handle_response(uri, resp, opts, &block)
end
- u[:etag] = ""
- u[:etag] = resp['etag'] if resp.key?('etag')
- u[:count] = 1
- u[:first_use] = now
- u[:last_use] = now
- rescue => e
- error "Failed to cache #{k}/#{resp.to_hash.inspect}: #{e.inspect}"
- return
end
- @cache[k] = u
- debug "Cached #{k}/#{resp.to_hash.inspect}: #{u.inspect_no_body}"
- debug "#{@cache.size} pages (#{@cache.keys.join(', ')}) cached up to now"
end
- # For debugging purposes
- class ::Hash
- def inspect_no_body
- temp = self.dup
- temp.delete(:body)
- temp.inspect
+ # uri:: uri to query (Uri object)
+ #
+ # simple get request, returns (if possible) response body following redirs
+ # and caching if requested
+ def get(uri, opts = {}, &block)
+ begin
+ resp = get_response(uri, opts, &block)
+ raise "http error: #{resp}" unless Net::HTTPOK === resp ||
+ Net::HTTPPartialContent === resp
+ return resp.body
+ rescue Exception => e
+ error e.message
+ error e.backtrace.join("\n")
end
+ return nil
end
- def expired?(uri, readtimeout, opentimeout)
- k = uri.to_s
- debug "Checking cache validity for #{k}"
+ # uri:: uri to query (Uri object or String)
+ # options:: options passed on to get_response; :method defaults to :head
+ #
+ # Performs a HEAD request through get_response. Returns the response
+ # object, or nil (after logging the error) when the request raised or the
+ # server answered with a client (4xx) or server (5xx) error.
+ def head(uri, options = {}, &block)
+ opts = {:method => :head}.merge(options)
begin
- return true unless @cache.key?(k)
- u = @cache[k]
-
- # TODO we always revalidate for the time being
-
- if u[:etag].empty? and u[:last_modified].nil?
- # TODO max-age
- return true
- end
-
- proxy = get_proxy(uri)
- proxy.open_timeout = opentimeout
- proxy.read_timeout = readtimeout
-
- proxy.start() {|http|
- yield uri.request_uri() if block_given?
- headers = @headers.dup
- headers['If-None-Match'] = u[:etag] unless u[:etag].empty?
- headers['If-Modified-Since'] = u[:last_modified].rfc2822 if u[:last_modified]
- debug "Cache HEAD request headers: #{headers.inspect}"
- # FIXME TODO We might want to use a Get here
- # because if a 200 OK is returned we would get the new body
- # with one connection less ...
- req = Net::HTTP::Head.new(uri.request_uri(), headers)
- if uri.user and uri.password
- req.basic_auth(uri.user, uri.password)
- end
- resp = http.request(req)
- debug "Checking cache validity of #{u.inspect_no_body} against #{resp.inspect}/#{resp.to_hash.inspect}"
- case resp
- when Net::HTTPNotModified
- return false
- else
- return true
- end
- }
- rescue => e
- error "Failed to check cache validity for #{uri}: #{e.inspect}"
- return true
+ resp = get_response(uri, opts, &block)
+ # Class#=== tests "is resp an instance of"; == would compare the class
+ # object with the response and never match.
+ raise "http error #{resp}" if Net::HTTPClientError === resp ||
+ Net::HTTPServerError === resp
+ return resp
+ rescue Exception => e
+ error e.message
+ error e.backtrace.join("\n")
end
+ return nil
end
- # gets a page from the cache if it's still (assumed to be) valid
- # TODO remove stale cached pages, except when called with noexpire=true
- def get_cached(uri_or_str, readtimeout=10, opentimeout=5,
- max_redir=@bot.config['http.max_redir'],
- noexpire=@bot.config['http.no_expire_cache'])
- if uri_or_str.kind_of?(URI)
- uri = uri_or_str
- else
- uri = URI.parse(uri_or_str.to_s)
- end
- debug "Getting cached #{uri}"
-
- if expired?(uri, readtimeout, opentimeout)
- debug "Cache expired"
- bod = get(uri, readtimeout, opentimeout, max_redir, [noexpire])
- bod.instance_variable_set(:@cached,false)
- else
- k = uri.to_s
- debug "Using cache"
- @cache[k][:count] += 1
- @cache[k][:last_use] = Time.now
- bod = @cache[k][:body]
- bod.instance_variable_set(:@cached,true)
- end
- unless noexpire
- remove_stale_cache
- end
- unless bod.respond_to?(:cached?)
- def bod.cached?
- return @cached
- end
+ def post(uri, data, options = {}, &block)
+ opts = {:method => :post, :body => data, :cache => false}.merge(options)
+ begin
+ resp = get_response(uri, opts, &block)
+ raise 'http error' unless Net::HTTPOK === resp
+ return resp
+ rescue Exception => e
+ error e.message
+ error e.backtrace.join("\n")
end
- return bod
+ return nil
end
- # We consider a page to be manually expired if it has no
- # etag and no last-modified and if any of the expiration
- # conditions are met (expire_time, max_cache_time, Expires)
- def manually_expired?(hash, time)
- auto = hash[:etag].empty? and hash[:last_modified].nil?
- # TODO max-age
- manual = (time - hash[:last_use] > @bot.config['http.expire_time']*60) or
- (time - hash[:first_use] > @bot.config['http.max_cache_time']*60) or
- (hash[:expires] < time)
- return (auto and manual)
+ # uri:: uri to query (Uri object or String)
+ # nbytes:: how many leading bytes to request; defaults to the bot's
+ #          'http.info_bytes' setting. 0 or nil means no limit.
+ # options:: further options for get; an explicit :range overrides nbytes
+ #
+ # Like get, but asks only for the first +nbytes+ bytes of the resource by
+ # means of a Range header. Returns whatever get returns.
+ def get_partial(uri, nbytes = @bot.config['http.info_bytes'], options = {}, &block)
+   limit = nbytes.to_i
+   opts = options
+   if limit > 0
+     # Byte ranges are inclusive on both ends, so the first +limit+ bytes
+     # are 0..limit-1.
+     opts = {:range => "bytes=0-#{limit - 1}"}.merge(options)
+   end
+   # With no positive limit the request is sent without a Range header, so
+   # that a limit of 0 fetches the whole page instead of a single byte.
+   return get(uri, opts, &block)
+ end
def remove_stale_cache
debug "Removing stale cache"
+ now = Time.new
+ max_last = @bot.config['http.expire_time'] * 60
+ max_first = @bot.config['http.max_cache_time'] * 60
debug "#{@cache.size} pages before"
begin
- now = Time.new
- @cache.reject! { |k, val|
- manually_expired?(val, now)
- }
+ @cache.reject! { |k, val|
+ (now - val.last_used > max_last) || (now - val.first_used > max_first)
+ }
rescue => e
error "Failed to remove stale cache: #{e.inspect}"
end
debug "#{@cache.size} pages after"
end
+
end
end
end
+
+# Core module that owns the bot's HttpUtil instance: it installs one as
+# @bot.httputil when the module is created and tears it down on cleanup.
+class HttpUtilPlugin < CoreBotModule
+  # Runs the CoreBotModule initialisation with the given arguments, then
+  # creates the HttpUtil helper and attaches it to the bot.
+  def initialize(*args)
+    super(*args)
+    debug 'initializing httputil'
+    util = Irc::Utils::HttpUtil.new(@bot)
+    @bot.httputil = util
+  end
+
+  # Calls cleanup on the bot's HttpUtil helper and detaches it from the bot.
+  def cleanup
+    debug 'shutting down httputil'
+    util = @bot.httputil
+    util.cleanup
+    @bot.httputil = nil
+  end
+end
+
+HttpUtilPlugin.new
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 251e7986..717630e3 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -318,6 +318,7 @@ module ::Irc
end
def Utils.bot=(b)
+ debug "initializing utils"
@@bot = b
@@safe_save_dir = "#{@@bot.botclass}/safe_save"
end
@@ -523,7 +524,7 @@ module ::Irc
# FIXME what happens if some big file is returned? We should share
# code with the url plugin to only retrieve partial file content!
- xml = self.bot.httputil.get_cached(url)
+ xml = self.bot.httputil.get(url)
if xml.nil?
debug "Unable to retrieve #{url}"
next
@@ -549,3 +550,5 @@ module ::Irc
end
end
+
+Irc::Utils.bot = Irc::Plugins.manager.bot