lib/rbot/core/utils/extends.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218

#-- vim:sw=2:et
#++
#
# :title: Standard classes extensions
#
# Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
# Copyright:: (C) 2006,2007 Giuseppe Bilotta
# License:: GPL v2
#
# This file collects extensions to standard Ruby classes and to some core rbot
# classes to be used by the various plugins
#
# Please note that global symbols have to be prefixed by :: because this plugin
# will be read into an anonymous module


# Extensions to the Array class
#
class ::Array

  # This method returns a random element from the array, or nil if the array is
  # empty
  #
  def pick_one
    return nil if self.empty?
    self[rand(self.length)]
  end
end

begin
  require 'iconv'
  $we_have_iconv = true
rescue LoadError
  $we_have_iconv = false
end

# Extensions to the String class
#
# TODO make ircify_html() accept an Hash of options, and make riphtml() just
# call ircify_html() with stronger purify options.
#
class ::String

  # This method will try to transcode a String supposed to hold an XML or HTML
  # document from the original charset to UTF-8.
  #
  # To find the original encoding, it will first see if the String responds to
  # #http_headers(), and if it does it will assume that the charset indicated
  # there is the correct one. Otherwise, it will try to detect the charset from
  # some typical XML and HTML headers
  def utfy_xml
    return self unless $we_have_iconv

    charset = nil

    if self.respond_to?(:http_headers) and headers = self.http_headers
      if headers['content-type'].first.match(/charset=(\S+?)\s*(?:;|\Z)/i)
        debug "charset #{charset} set from header"
        charset = $1
      end
    end

    if not charset
      case self
      when /<\?xml.*encoding="(\S+)".*\?>/i
        charset = $1
      when /<meta\s+http-equiv\s*=\s*["']?Content-Type["']?.*charset\s*=\s*(\S+?)(?:;|["']|\s).*>/i
        charset = $1
      end
      debug "charset #{charset} set from string"
    end

    if charset
      return Iconv.iconv('utf-8', charset, self).join rescue self
    else
      debug "Couldn't find charset for #{self.inspect}"
      return self
    end

  end

  # This method will return a purified version of the receiver, with all HTML
  # stripped off and some of it converted to IRC formatting
  #
  def ircify_html
    txt = self

    # remove scripts
    txt.gsub!(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "")

    # remove styles
    txt.gsub!(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")

    # bold and strong -> bold
    txt.gsub!(/<\/?(?:b|strong)(?:\s+[^>]*)?>/im, "#{Bold}")

    # italic, emphasis and underline -> underline
    txt.gsub!(/<\/?(?:i|em|u)(?:\s+[^>]*)?>/im, "#{Underline}")

    ## This would be a nice addition, but the results are horrible
    ## Maybe make it configurable?
    # txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}")

    # Paragraph and br tags are converted to whitespace
    txt.gsub!(/<\/?(p|br)(?:\s+[^>]*)?\s*\/?\s*>/, ' ')
    txt.gsub!("\n", ' ')
    txt.gsub!("\r", ' ')

    # All other tags are just removed
    txt.gsub!(/<[^>]+>/, '')

    # Convert HTML entities. We do it now to be able to handle stuff
    # such as &nbsp;
    txt = Utils.decode_html_entities(txt)

    # Remove double formatting options, since they only waste bytes
    txt.gsub!(/#{Bold}(\s*)#{Bold}/, '\1')
    txt.gsub!(/#{Underline}(\s*)#{Underline}/, '\1')

    # Simplify whitespace that appears on both sides of a formatting option
    txt.gsub!(/\s+(#{Bold}|#{Underline})\s+/, ' \1')
    txt.sub!(/\s+(#{Bold}|#{Underline})\z/, '\1')
    txt.sub!(/\A(#{Bold}|#{Underline})\s+/, '\1')

    # And finally whitespace is squeezed
    txt.gsub!(/\s+/, ' ')

    # Decode entities and strip whitespace
    return txt.strip
  end

  # This method will strip all HTML crud from the receiver
  #
  def riphtml
    self.gsub(/<[^>]+>/, '').gsub(/&amp;/,'&').gsub(/&quot;/,'"').gsub(/&lt;/,'<').gsub(/&gt;/,'>').gsub(/&ellip;/,'...').gsub(/&apos;/, "'").gsub("\n",'')
  end
end


# Extensions to the Regexp class, with some common and/or complex regular
# expressions.
#
class ::Regexp

  # A method to build a regexp that matches a list of something separated by
  # optional commas and/or the word "and", an optionally repeated prefix,
  # and whitespace.
  def Regexp.new_list(reg, pfx = "")
    if pfx.kind_of?(String) and pfx.empty?
      return %r(#{reg}(?:,?(?:\s+and)?\s+#{reg})*)
    else
      return %r(#{reg}(?:,?(?:\s+and)?(?:\s+#{pfx})?\s+#{reg})*)
    end
  end

  IN_ON = /in|on/

  module Irc
    # Match a list of channel anmes separated by optional commas, whitespace
    # and optionally the word "and"
    CHAN_LIST = Regexp.new_list(GEN_CHAN)

    # Match "in #channel" or "on #channel" and/or "in private" (optionally
    # shortened to "in pvt"), returning the channel name or the word 'private'
    # or 'pvt' as capture
    IN_CHAN = /#{IN_ON}\s+(#{GEN_CHAN})|(here)|/
    IN_CHAN_PVT = /#{IN_CHAN}|in\s+(private|pvt)/

    # As above, but with channel lists
    IN_CHAN_LIST_SFX = Regexp.new_list(/#{GEN_CHAN}|here/, IN_ON)
    IN_CHAN_LIST = /#{IN_ON}\s+#{IN_CHAN_LIST_SFX}|anywhere|everywhere/
    IN_CHAN_LIST_PVT_SFX = Regexp.new_list(/#{GEN_CHAN}|here|private|pvt/, IN_ON)
    IN_CHAN_LIST_PVT = /#{IN_ON}\s+#{IN_CHAN_LIST_PVT_SFX}|anywhere|everywhere/

    # Match a list of nicknames separated by optional commas, whitespace and
    # optionally the word "and"
    NICK_LIST = Regexp.new_list(GEN_NICK)

  end

end


module ::Irc


  class BasicUserMessage

    # We extend the BasicUserMessage class with a method that parses a string
    # which is a channel list as matched by IN_CHAN(_LIST) and co. The method
    # returns an array of channel names, where 'private' or 'pvt' is replaced
    # by the Symbol :"?", 'here' is replaced by the channel of the message or
    # by :"?" (depending on whether the message target is the bot or a
    # Channel), and 'anywhere' and 'everywhere' are replaced by Symbol :*
    #
    def parse_channel_list(string)
      return [:*] if [:anywhere, :everywhere].include? string.to_sym
      string.scan(
      /(?:^|,?(?:\s+and)?\s+)(?:in|on\s+)?(#{Regexp::Irc::GEN_CHAN}|here|private|pvt)/
                 ).map { |chan_ar|
        chan = chan_ar.first
        case chan.to_sym
        when :private, :pvt
          :"?"
        when :here
          case self.target
          when Channel
            self.target.name
          else
            :"?"
          end
        else
          chan
        end
      }.uniq
    end
  end
end