# File lib/robotex.rb, line 18
#
# Builds the rule tables for one site's robots.txt.
#
# The response object is obtained from Robotex.get_robots_txt(uri, user_agent).
# When that call returns nothing, or the response is not "text/plain", or its
# status is not ["200", "OK"], a permissive default ("User-agent: *" /
# "Allow: /") is parsed instead.
#
# Populates:
#   @allows    - Hash of agent Regexp => Array of allowed-path Regexps
#   @disallows - Hash of agent Regexp => Array of disallowed-path Regexps
#   @delays    - Hash of agent Regexp => Integer crawl delay
#   @parsed    - set to true once every line has been processed
#
# uri        - passed through to Robotex.get_robots_txt.
# user_agent - passed through to Robotex.get_robots_txt.
def initialize(uri, user_agent)
  io = Robotex.get_robots_txt(uri, user_agent)

  if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
    io = StringIO.new("User-agent: *\nAllow: /\n")
  end

  @disallows = {}
  @allows = {}
  @delays = {}

  # Rules that appear before any User-agent line apply to every agent.
  agent = /.*/

  io.each do |line|
    # Drop full-line and trailing comments, then skip lines left blank.
    line = line.sub(/#.*/, "").strip
    next if line.empty?

    # Split on the first colon only so values such as "http://host/" survive,
    # and so a bare ":" yields an empty key instead of nil.
    key, value = line.split(":", 2)
    key = key.to_s.strip.downcase
    value = value.to_s.strip

    case key
    when "user-agent"
      agent = to_regex(value)
    when "allow"
      (@allows[agent] ||= []) << to_regex(value)
    when "disallow"
      (@disallows[agent] ||= []) << to_regex(value)
    when "crawl-delay"
      @delays[agent] = value.to_i
    end
  end

  @parsed = true
end
# File lib/robotex.rb, line 52
#
# Reports whether +user_agent+ may fetch +uri+ according to the parsed rules.
#
# A path is refused only when at least one Disallow rule for a matching agent
# matches it AND no Allow rule for a matching agent matches it; a matching
# Allow rule therefore overrides a Disallow.
#
# uri        - a URI, or anything whose to_s can be parsed by URI.parse. The
#              resulting object must respond to request_uri.
# user_agent - String matched against the agent Regexps used as keys of
#              @allows and @disallows.
#
# Returns true or false. Always true when @parsed is not set.
def allowed?(uri, user_agent)
  return true unless @parsed

  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  path = uri.request_uri

  disallowed = @disallows.any? do |agent, rules|
    user_agent =~ agent && rules.any? { |rule| path =~ rule }
  end
  return true unless disallowed

  # Only consulted for paths that were disallowed above: an Allow match wins.
  @allows.any? do |agent, rules|
    user_agent =~ agent && rules.any? { |rule| path =~ rule }
  end
end
# File lib/robotex.rb, line 83
#
# Looks up the crawl delay recorded for +user_agent+.
#
# user_agent - String matched against the agent Regexps used as keys of
#              @delays, in the hash's iteration order.
#
# Returns the value stored for the first matching agent, or nil when no
# agent pattern matches.
def delay(user_agent)
  _agent, seconds = @delays.find { |agent, _seconds| agent =~ user_agent }
  seconds
end
# File lib/robotex.rb, line 92
#
# Converts a robots.txt value (a path pattern or a user-agent name) into a
# Regexp anchored at the start of the string.
#
# Every character is matched literally except "*", which matches any run of
# characters.
#
# pattern - String pattern; nil is treated like an empty string.
#
# Returns a Regexp. A nil, empty or whitespace-only pattern yields a Regexp
# that never matches anything.
def to_regex(pattern)
  # An empty negative lookahead fails at every position, so this can never
  # match, whatever the input contains.
  return /(?!)/ if pattern.to_s.strip.empty?

  escaped = Regexp.escape(pattern)
  # Regexp.escape turned "*" into "\*"; turn it back into a wildcard.
  wildcarded = escaped.gsub(Regexp.escape("*"), ".*")
  Regexp.compile("\\A#{wildcarded}")
end