This repository was archived by the owner on Apr 5, 2021. It is now read-only.

Commit 25e8f61

Delta Updates (#50)
* add delta import and update document_builder for :only AND :nest in the delta scenario
* allow for creating docs that contain a :nest option
* make sure parent_missing == skip on delta update
* update creating-document check to allow :root flag
* allow for s3 region to be specified in environment
* fix querystring example
* update rake task description
1 parent 1035847 commit 25e8f61
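
Taken together, the change adds a delta-update path alongside the full import. A minimal sketch of how the new entry point might be driven (the fixture directory is hypothetical; the option keys `:data_path`, `:delta_original`, and `:delta_update` come from `import_with_delta`/`index_with_delta` below, and delta.rake is expected to pass them through):

```ruby
require 'data_magic'

# Sketch only: update an existing index from a single delta CSV.
# The delta file is read from a "delta/" subdirectory under the data path.
DataMagic.import_with_delta(
  data_path: 'spec/fixtures/nested_delta',    # hypothetical fixture directory
  delta_original: 'latest-school-data.csv',   # config entry flagged with :delta_only
  delta_update: 'latest-school-data.csv'      # file expected at <data_path>/delta/
)
```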

13 files changed: +325, -27 lines

API.md

Lines changed: 1 addition & 1 deletion
@@ -163,7 +163,7 @@ For example: `school.degrees_awarded.predominant=2,3,4` will match records with
 
 To exclude a set of records from results, use a negative match (also known as an inverted match). Append the characters `__not` to the parameter name to specify a negative match.
 
-For example: `school.region_id__not==5` matches on records where the `school.region_id` does _not_ equal `5`.
+For example: `school.region_id__not=5` matches on records where the `school.region_id` does _not_ equal `5`.
 
 ### Range matches with the `__range` operator
 

lib/data_magic.rb

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def self.s3
     if ENV['VCAP_APPLICATION']
       s3cred = ::CF::App::Credentials.find_by_service_name(ENV['s3_bucket_service'] || 'bservice')
     else
-      s3cred = {'access_key'=> ENV['s3_access_key'], 'secret_key' => ENV['s3_secret_key']}
+      s3cred = {'access_key'=> ENV['s3_access_key'], 'secret_key' => ENV['s3_secret_key'], 'region' => ENV['s3_region']}
     end
     # logger.info "s3cred = #{s3cred.inspect}"
     if ENV['RACK_ENV'] != 'test'

lib/data_magic/index.rb

Lines changed: 76 additions & 22 deletions
@@ -20,49 +20,58 @@ def self.import_csv(data, options={})
     Index::Importer.process(data, options)
   end
 
-  # pre-condition: index is already created w/ config
-  def self.index_with_dictionary(options = {})
+  def self.log_index_start
     start_time = Time.now
-    Config.logger.debug "--- index_with_dictionary, starting at #{start_time}"
+    Config.logger.debug "--- Indexing Begins, starting at #{start_time}"
+    start_time
+  end
+
+  def self.log_index_end(start_time)
+    end_time = Time.now
+    logger.debug "indexing complete: #{distance_of_time_in_words(end_time, start_time)}"
+    logger.debug "duration: #{end_time - start_time}"
+  end
 
-    logger.info "files: #{self.config.files}"
+  def self.index_file_process(options = {}, filepath)
+    begin
+      logger.debug "--"*40
+      logger.debug "-- #{filepath}"
+      logger.debug "--"*40
+      file_start = Time.now
+      data = config.read_path(filepath)
+      rows, _ = DataMagic.import_csv(data, options)
+      file_end = Time.now
+      logger.debug "imported #{rows} rows in #{distance_of_time_in_words(file_end, file_start)}, ms: #{file_end - file_start}"
+    rescue DataMagic::InvalidData => e
+      Config.logger.debug "Error: skipping #{filepath}, #{e.message}"
+    end
+  end
 
+  # pre-condition: index is already created w/ config
+  def self.index_with_dictionary(options = {})
+    start_time = log_index_start
     # optionally continue importing from a named file (see import.rake)
     starting_from = 0
     if options[:continue]
      starting_from = config.files.find_index { |file| file.match( /#{options[:continue]}/ ) }
      logger.info "Indexing continues with file: #{options[:continue]}" unless starting_from.nil?
     end
-
+    logger.info "files: #{self.config.files[starting_from.to_i..-1]}"
     config.files[starting_from.to_i..-1].each_with_index do |filepath, index|
       fname = filepath.split('/').last
       logger.debug "indexing #{fname} #{starting_from + index} file config:#{config.additional_data_for_file(starting_from + index).inspect}"
       options[:add_data] = config.additional_data_for_file(starting_from + index)
+      options[:root] = config.info_for_file(starting_from + index, :root)
       options[:only] = config.info_for_file(starting_from + index, :only)
      options[:nest] = config.info_for_file(starting_from + index, :nest)
-      begin
-        logger.debug "--"*40
-        logger.debug "-- #{filepath}"
-        logger.debug "--"*40
-        file_start = Time.now
-        data = config.read_path(filepath)
-        rows, _ = DataMagic.import_csv(data, options)
-        file_end = Time.now
-        logger.debug "imported #{rows} rows in #{distance_of_time_in_words(file_end, file_start)}, ms: #{file_end - file_start}"
-      rescue DataMagic::InvalidData => e
-        Config.logger.debug "Error: skipping #{filepath}, #{e.message}"
-      end
+      index_file_process(options, filepath)
     end
-    end_time = Time.now
-    logger.debug "indexing complete: #{distance_of_time_in_words(end_time, start_time)}"
-    logger.debug "duration: #{end_time - start_time}"
+    log_index_end(start_time)
   end
 
   def self.import_with_dictionary(options = {})
-    #logger.debug("field_mapping: #{field_mapping.inspect}")
     options[:mapping] = config.field_mapping
     options = options.merge(config.options)
-
     es_index_name = self.config.load_datayaml(options[:data_path])
     unless config.index_exists?(es_index_name)
       logger.info "creating #{es_index_name}" # TO DO: fix #14
@@ -73,6 +82,51 @@ def self.import_with_dictionary(options = {})
 
   end # import_with_dictionary
 
+  def self.index_with_delta(options = {})
+    # delta updates the current index with a single file
+    if options[:delta_original]
+      start_time = log_index_start
+      # find the index of the delta file from the config by the :delta_only key (see delta.rake)
+      original_file_index = nil
+      config.files.each_with_index do |file, index|
+        if config.info_for_file(index, :delta_only)
+          original_file_index = index
+        end
+      end
+
+      unless original_file_index
+        raise ArgumentError, "delta_original file must contain :delta_only key in data.yaml. No :delta_only key found."
+      end
+
+      # use specified :delta_update filename, or fall back to :delta_original if not provided
+      delta_filename = options[:delta_update] || options[:delta_original]
+      config.files[original_file_index..original_file_index].each do |filepath|
+        original_fname = filepath.split('/').last
+        # update filepath to use a "delta" subdirectory within DATA_PATH (e.g., <DATA_PATH>/delta/<CSV_FILE>)
+        delta_filepath = filepath.gsub(/#{original_fname}/, "delta/#{delta_filename}")
+        logger.debug "delta update with #{delta_filename} file config:#{config.additional_data_for_file(original_file_index).inspect}"
+        options[:add_data] = config.additional_data_for_file(original_file_index)
+        # Append the :delta_only array as our :only fields
+        options[:only] = config.info_for_file(original_file_index, :delta_only)
+        options[:nest] = config.info_for_file(original_file_index, :nest)
+        options[:root] = false # we are not creating new documents
+        options[:nest][:parent_missing] = 'skip' # we allow skips
+        index_file_process(options, delta_filepath)
+      end
+      log_index_end(start_time)
+    else
+      raise ArgumentError, "delta.rake requires 'delta_original' argument to be a filename from the config. No option[:delta_original] provided."
+    end
+  end
+
+  # pre-condition: index is already created w/ config
+  def self.import_with_delta(options = {})
+    options[:mapping] = config.field_mapping
+    options = options.merge(config.options)
+    es_index_name = self.config.load_datayaml(options[:data_path])
+    index_with_delta(options)
+  end # import_with_delta
+
   private
   def self.valid_types
     %w[integer float string literal name autocomplete boolean]

lib/data_magic/index/document_builder.rb

Lines changed: 9 additions & 2 deletions
@@ -13,6 +13,7 @@ def logger
     # where all column_names and values are strings
     # fields: column_name => field_name
     # config: DataMagic.Config instance for dictionary, column types, NULL
+    # return: hash - may include :only, :nest, or both :only AND :nest fields/value pairs as specified in config.files
     def build(row, builder_data, config)
       fields = builder_data.new_field_names
       options = builder_data.options
@@ -27,8 +28,14 @@ def build(row, builder_data, config)
       field_values.merge!(lowercase_columns(field_values, config.column_field_types))
       field_values.merge!(additional) if additional
       doc = NestedHash.new.add(field_values)
-      doc = parse_nested(doc, options) if options[:nest]
-      doc = select_only_fields(doc, options[:only]) unless options[:only].nil?
+      if options[:only] && options[:nest]
+        doc_from_only = select_only_fields(doc, options[:only])
+        doc_from_nest = parse_nested(doc, options)
+        doc = doc_from_only.merge!(doc_from_nest)
+      else
+        doc = parse_nested(doc, options) if options[:nest]
+        doc = select_only_fields(doc, options[:only]) unless options[:only].nil?
+      end
       doc
     end
 
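For the combined case this commit adds, the resulting document carries both the flat `:only` fields and the nested block. A hand-worked sketch of the shapes involved (illustrative values only; field names follow the fake-nested-delta fixture later in this diff, and the exact output of `select_only_fields`/`parse_nested` may differ in detail):

```ruby
# Flat document built from one CSV row (after dictionary mapping).
doc = { 'id' => '1', 'city' => 'Normal', 'state' => 'AL', 'sat_average' => 1195 }

# With options[:only] = ['id', 'city', 'state'] and
# options[:nest] = { 'key' => 'latest', 'contents' => ['sat_average'] }:
doc_from_only = { 'id' => '1', 'city' => 'Normal', 'state' => 'AL' }
doc_from_nest = { 'id' => '1', 'latest' => { 'sat_average' => 1195 } }

# The new branch merges the two, so a single row updates both kinds of fields.
doc_from_only.merge!(doc_from_nest)
# => { 'id' => '1', 'city' => 'Normal', 'state' => 'AL', 'latest' => { 'sat_average' => 1195 } }
```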

lib/data_magic/index/super_client.rb

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def refresh_index
     end
 
     def creating?
-      options[:nest] == nil
+      options[:root] || options[:nest] == nil
     end
 
     def allow_skips?

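The `creating?` tweak is what lets the delta path update documents in place: with `:root` forced to `false` and a `:nest` option present, the client no longer treats each row as a new document. A quick check of the expression, worked by hand in plain Ruby (not the class itself):

```ruby
# options[:root] || options[:nest] == nil  -- note `==` binds tighter than `||`
creating = ->(options) { options[:root] || options[:nest] == nil }

creating.call({ nest: nil })                             # => true  (plain import: create docs)
creating.call({ nest: { key: 'latest' }, root: true })   # => true  (data.yaml sets :root)
creating.call({ nest: { key: 'latest' }, root: false })  # => false (delta update: update only)
```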
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+version: 1.5.5
+api: school
+index: fake-nested-delta
+unique: [id]
+
+dictionary:
+  id: UNITID
+  name:
+    source: INSTNM
+    type: literal
+  city: CITY_MAIN
+  state: STABBR_MAIN
+  zipcode: ZIP_MAIN
+  location.lat: LATITUDE_MAIN
+  location.lon: LONGITUDE_MAIN
+  under_investigation:
+    source: HCM2
+    type: integer
+    description: under ED monitoring flag
+
+  sat_average: SAT_AVG
+  earnings.6_yrs_after_entry.median:
+    source: earn_2002_p10
+    description: Median earnings of students
+    type: integer
+
+  earnings.6_yrs_after_entry.percent_gt_25k:
+    source: gt_25k_2006_p6
+    description: Share of students earning over $25,000/year
+    type: float
+
+files:
+  - name: latest-school-data.csv
+    only: [id, name, city, state, under_investigation]
+  - name: latest-school-data.csv
+    nest:
+      key: latest
+      contents: [earnings, sat_average]
+    delta_only: [city, state, under_investigation]
+  - name: school2013.csv
+    nest:
+      key: 2013
+      contents: [earnings, sat_average]
+  - name: school2012.csv
+    nest:
+      key: 2012
+      contents: [earnings, sat_average]
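
In this fixture, the second `latest-school-data.csv` entry carries the `:delta_only` key, so it is the file a delta run targets. The rewritten path for the update file follows the `gsub` in `index_with_delta` above (the directory prefix here is hypothetical, standing in for wherever the data path points):

```ruby
filepath       = './spec/fixtures/nested_delta/latest-school-data.csv'  # hypothetical data_path + config entry
delta_filename = 'latest-school-data.csv'
original_fname = filepath.split('/').last
filepath.gsub(/#{original_fname}/, "delta/#{delta_filename}")
# => "./spec/fixtures/nested_delta/delta/latest-school-data.csv"
```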
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6,HCM2
+1,Normal,AL,1,35762,5,34.7834,-86.5685,Alabama A & M University,1195,30000,0.53,1
+2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61,0
+3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50,1
+4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1900,0.1,0
+5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82,1
+6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06,0
+7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50,1
+8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59,0
+9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19,1
+10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59,0
+11,Montgomery,NULL,1,36117,5,32.3643,-86.2957,Auburn University at Montgomery,940,49879,0.64,1
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6,HCM2
+1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53,0
+2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61,0
+3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50,0
+4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09,0
+5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82,0
+6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06,0
+7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50,0
+8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59,0
+9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19,0
+10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59,0
+11,Montgomery,NULL,1,36117,5,32.3643,-86.2957,Auburn University at Montgomery,940,49879,0.64,0
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6,HCM2
+1,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,461,35231,0.01,0
+2,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,986,34095,0.71,0
+3,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1094,42579,0.39,0
+4,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,854,37589,0.15,0
+5,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,650,13611,0.04,0
+6,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,797,36924,0.64,0
+7,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,994,31799,0.60,0
+8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1420,30063,0.97,0
+9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1292,42150,0.83,0
+10,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,605,2608,0.92,0
+11,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,2608,0.92,0
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+UNITID,CITY_MAIN,STABBR_MAIN,ST_FIPS_MAIN,ZIP_MAIN,REGION_MAIN,LATITUDE_MAIN,LONGITUDE_MAIN,INSTNM,SAT_AVG,earn_2002_p10,gt_25k_2006_p6,HCM2
+1,Normal,AL,1,35762,5,34.7834,-86.5685,Reichert University,1195,26318,0.53,0
+2,Montgomery,AL,1,36109-3378,5,32.3842,-86.2164,Montgomery School,770,6785,0.61,0
+3,Montevallo,AL,1,35115-6000,5,33.1063,-86.8651,Indigo Card Community College,526,16767,0.50,0
+4,Montgomery,AL,1,36104-0271,5,32.3643,-86.2957,Warm Meadow School of Fine Art,457,1836,0.09,0
+5,Alexander City,AL,1,35010,5,32.9244,-85.9465,Kovacek Institute of Technology,1511,19372,0.82,0
+6,Athens,AL,1,35611,5,34.8056,-86.9651,Athens Institute,1057,49203,0.06,0
+7,Auburn University,AL,1,36849,5,32.6002,-85.4924,Alabama Beauty College of Auburn University,486,44097,0.50,0
+8,Birmingham,AL,1,35254,5,33.5155,-86.8536,Condemned Balloon Institute,616,59759,0.59,0
+9,Tanner,AL,1,35671,5,34.6543,-86.9491,Inquisitive Farm College,971,34183,0.19,0
+10,Enterprise,AL,1,36330-1300,5,31.2975,-85.837,Enterprise University,920,42629,0.59,0
