|
| 1 | +# Copyright (c) 2014 Inside Systems, Inc All rights reserved. |
| 2 | +# |
| 3 | +# This program is licensed to you under the Apache License Version 2.0, |
| 4 | +# and you may not use this file except in compliance with the Apache License Version 2.0. |
| 5 | +# You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. |
| 6 | +# |
| 7 | +# Unless required by applicable law or agreed to in writing, |
| 8 | +# software distributed under the Apache License Version 2.0 is distributed on an |
| 9 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 10 | +# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. |
| 11 | + |
| 12 | +# Author:: Kelley Reynolds (mailto:[email protected]) |
| 13 | +# Copyright:: Copyright (c) 2014 Inside Systems Inc |
| 14 | +# License:: Apache License Version 2.0 |
| 15 | + |
| 16 | +require 'uri' |
| 17 | +require 'cgi' |
| 18 | + |
| 19 | +module RefererParser |
| 20 | + class Parser |
| 21 | + DefaultFile = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'data', 'referers.json')) |
| 22 | + |
| 23 | + # Create a new parser from one or more filenames/uris, defaults to ../data/referers.json |
| 24 | + def initialize(uris=DefaultFile) |
| 25 | + @domain_index ||= {} |
| 26 | + @name_hash ||= {} |
| 27 | + |
| 28 | + update(uris) |
| 29 | + end |
| 30 | + |
| 31 | + # Update the referer database with one or more uris |
| 32 | + def update(uris) |
| 33 | + [uris].flatten.each do |uri| |
| 34 | + deserialize_referer_data(read_referer_data(uri), File.extname(uri).downcase) |
| 35 | + end |
| 36 | + |
| 37 | + true |
| 38 | + end |
| 39 | + |
| 40 | + # Clean out the database |
| 41 | + def clear! |
| 42 | + @domain_index, @name_hash = {}, {} |
| 43 | + |
| 44 | + true |
| 45 | + end |
| 46 | + |
| 47 | + # Add a referer to the database with medium, name, domain or array of domains, and a parameter or array of parameters |
| 48 | + # If called manually and a domain is added to an existing entry with a path, you may need to call optimize_index! afterwards. |
| 49 | + def add_referer(medium, name, domains, parameters=nil) |
| 50 | + # The same name can be used with multiple mediums so we make a key here |
| 51 | + name_key = "#{name}-#{medium}" |
| 52 | + |
| 53 | + # Update the name has with the parameter and medium data |
| 54 | + @name_hash[name_key] = {:source => name, :medium => medium, :parameters => [parameters].flatten } |
| 55 | + |
| 56 | + # Update the domain to name index |
| 57 | + [domains].flatten.each do |domain_url| |
| 58 | + domain, *path = domain_url.split('/') |
| 59 | + if domain =~ /\Awww\.(.*)\z/i |
| 60 | + domain = $1 |
| 61 | + end |
| 62 | + |
| 63 | + domain.downcase! |
| 64 | + |
| 65 | + @domain_index[domain] ||= [] |
| 66 | + if !path.empty? |
| 67 | + @domain_index[domain] << ['/' + path.join('/'), name_key] |
| 68 | + else |
| 69 | + @domain_index[domain] << ['/', name_key] |
| 70 | + end |
| 71 | + end |
| 72 | + end |
| 73 | + |
| 74 | + # Prune duplicate entries and sort with the most specific path first if there is more than one entry |
| 75 | + # In this case, sorting by the longest string works fine |
| 76 | + def optimize_index! |
| 77 | + @domain_index.each do |key, val| |
| 78 | + # Sort each path/name_key pair by the longest path |
| 79 | + @domain_index[key].sort! { |a, b| |
| 80 | + b[0].size <=> a[0].size |
| 81 | + }.uniq! |
| 82 | + end |
| 83 | + end |
| 84 | + |
| 85 | + # Given a string or URI, return a hash of data |
| 86 | + def parse(obj) |
| 87 | + url = obj.is_a?(URI) ? obj : URI.parse(obj.to_s) |
| 88 | + |
| 89 | + if !['http', 'https'].include?(url.scheme) |
| 90 | + raise InvalidUriError.new("Only HTTP and HTTPS schemes are supported -- #{url.scheme}") |
| 91 | + end |
| 92 | + |
| 93 | + data = { :known => false, :uri => url.to_s } |
| 94 | + |
| 95 | + domain, name_key = domain_and_name_key_for(url) |
| 96 | + if domain and name_key |
| 97 | + referer_data = @name_hash[name_key] |
| 98 | + data[:known] = true |
| 99 | + data[:source] = referer_data[:source] |
| 100 | + data[:medium] = referer_data[:medium] |
| 101 | + data[:domain] = domain |
| 102 | + |
| 103 | + # Parse parameters if the referer uses them |
| 104 | + if url.query and referer_data[:parameters] |
| 105 | + query_params = CGI.parse(url.query) |
| 106 | + referer_data[:parameters].each do |param| |
| 107 | + # If there is a matching parameter, get the first non-blank value |
| 108 | + if !(values = query_params[param]).empty? |
| 109 | + data[:term] = values.select { |v| v.strip != "" }.first |
| 110 | + break if data[:term] |
| 111 | + end |
| 112 | + end |
| 113 | + end |
| 114 | + end |
| 115 | + |
| 116 | + data |
| 117 | + rescue URI::InvalidURIError |
| 118 | + raise InvalidUriError.new("Unable to parse URI, not a URI? -- #{obj.inspect}", $!) |
| 119 | + end |
| 120 | + |
| 121 | + protected |
| 122 | + |
| 123 | + # Determine the correct name_key for this host and path |
| 124 | + def domain_and_name_key_for(uri) |
| 125 | + # Create a proc that will return immediately |
| 126 | + check = Proc.new do |domain| |
| 127 | + domain.downcase! |
| 128 | + if paths = @domain_index[domain] |
| 129 | + paths.each do |path, name_key| |
| 130 | + return [domain, name_key] if uri.path.include?(path) |
| 131 | + end |
| 132 | + end |
| 133 | + end |
| 134 | + |
| 135 | + # First check hosts with and without the www prefix with the path |
| 136 | + if uri.host =~ /\Awww\.(.+)\z/i |
| 137 | + check.call $1 |
| 138 | + else |
| 139 | + check.call uri.host |
| 140 | + end |
| 141 | + |
| 142 | + # Remove subdomains until only three are left (probably good enough) |
| 143 | + host_arr = uri.host.split(".") |
| 144 | + while host_arr.size > 2 do |
| 145 | + host_arr.shift |
| 146 | + check.call host_arr.join(".") |
| 147 | + end |
| 148 | + |
| 149 | + nil |
| 150 | + end |
| 151 | + |
| 152 | + def deserialize_referer_data(data, ext) |
| 153 | + # Parse the loaded data with the correct parser |
| 154 | + deserialized_data = if ['.yml', '.yaml'].include?(ext) |
| 155 | + deserialize_yaml(data) |
| 156 | + elsif ext == '.json' |
| 157 | + deserialize_json(data) |
| 158 | + else |
| 159 | + raise UnsupportedFormatError.new("Only yaml and json file formats are currently supported -- #{@msg}") |
| 160 | + end |
| 161 | + |
| 162 | + begin |
| 163 | + parse_referer_data deserialized_data |
| 164 | + rescue |
| 165 | + raise CorruptReferersError.new("Unable to parse data file -- #{$!.class} #{$!.to_s}", $!) |
| 166 | + end |
| 167 | + end |
| 168 | + |
| 169 | + def deserialize_yaml(data) |
| 170 | + require 'yaml' |
| 171 | + YAML.load(data) |
| 172 | + rescue Exception => e |
| 173 | + raise CorruptReferersError.new("Unable to YAML file -- #{e.to_s}", e) |
| 174 | + end |
| 175 | + |
| 176 | + def deserialize_json(data) |
| 177 | + require 'json' |
| 178 | + JSON.parse(data) |
| 179 | + rescue JSON::ParserError |
| 180 | + raise CorruptReferersError.new("Unable to JSON file -- #{$!.to_s}", $!) |
| 181 | + end |
| 182 | + |
| 183 | + def read_referer_data(uri) |
| 184 | + # Attempt to read the data from the network if application, or the file on the local system |
| 185 | + if uri =~ /\A(?:ht|f)tps?:\/\// |
| 186 | + require 'open-uri' |
| 187 | + begin |
| 188 | + open(uri).read |
| 189 | + rescue OpenURI::HTTPError |
| 190 | + raise InvalidUriError.new("Cannot load referer data from URI #{uri} -- #{$!.to_s}", $!) |
| 191 | + end |
| 192 | + else |
| 193 | + File.read(uri) |
| 194 | + end |
| 195 | + end |
| 196 | + |
| 197 | + # Create an index that maps domains/paths to their name/medium and a hash that contains their metadata |
| 198 | + # The index strips leading www in order to keep the index smaller |
| 199 | + # Format of the domain_index: |
| 200 | + # { domain => [[path1, name_key], [path2, name_key], ... ] } |
| 201 | + # Format of the name_hash: |
| 202 | + # { name_key => {:source, :medium, :parameters} } |
| 203 | + def parse_referer_data(data) |
| 204 | + data.each do |medium, name_hash| |
| 205 | + name_hash.each do |name, name_data| |
| 206 | + add_referer(medium, name, name_data['domains'], name_data['parameters']) |
| 207 | + end |
| 208 | + end |
| 209 | + |
| 210 | + optimize_index! |
| 211 | + rescue |
| 212 | + raise CorruptReferersError.new("Unable to parse referer data", $!) |
| 213 | + end |
| 214 | + end |
| 215 | +end |
0 commit comments