|
1 | 1 | package edu.usc.irds.sparkler.model;
|
2 | 2 |
|
| 3 | +import edu.usc.irds.sparkler.Constants; |
3 | 4 | import edu.usc.irds.sparkler.JobContext;
|
4 | 5 | import edu.usc.irds.sparkler.util.StringUtil;
|
5 | 6 | import org.apache.solr.client.solrj.beans.Field;
|
|
8 | 9 | import java.net.MalformedURLException;
|
9 | 10 | import java.net.URL;
|
10 | 11 | import java.util.Date;
|
| 12 | +import java.text.SimpleDateFormat; |
11 | 13 | import java.util.HashMap;
|
12 | 14 | import java.util.Map;
|
13 | 15 | import java.util.Random;
|
@@ -38,6 +40,15 @@ public class Resource implements Serializable {
|
38 | 40 | @Field("http_method") private String httpMethod;
|
39 | 41 | @Field("jobmeta") private String metadata;
|
40 | 42 |
|
| 43 | + private String version = "1.0"; |
| 44 | + private Date modifiedTime = new Date(); |
| 45 | + private String crawler = "sparkler"; |
| 46 | + private Integer fetchDepth = 0; |
| 47 | + private Double pageScore = 0.0; |
| 48 | + private Integer retriesSinceFetch = -1; |
| 49 | + private Integer fetchStatusCode = 0; |
| 50 | + private Long responseTime = new Long(0); |
| 51 | + |
    // No-arg constructor; required by SolrJ's @Field bean binding to
    // instantiate the object before populating annotated fields.
    public Resource() {
    }
|
43 | 54 |
|
@@ -101,6 +112,122 @@ public Resource(String url, Integer discoverDepth, JobContext sparklerJob, Resou
|
101 | 112 |
|
102 | 113 | }
|
103 | 114 |
|
| 115 | + public Resource(Map<String, Object> dataMap) { |
| 116 | + System.out.println("Resource constructor ---------------"); |
| 117 | + for (String key : dataMap.keySet()) { |
| 118 | + System.out.println(key + " => " + dataMap.get(key)); |
| 119 | + } |
| 120 | + |
| 121 | + if (dataMap.containsKey("id")) id = (String)dataMap.get("id"); |
| 122 | + if (dataMap.containsKey("url")) url = (String)dataMap.get("url"); |
| 123 | + if (dataMap.containsKey("group")) group = (String)dataMap.get("group"); |
| 124 | + if (dataMap.containsKey("discover_depth")) { |
| 125 | + try { |
| 126 | + discoverDepth = (Integer)dataMap.get("discover_depth"); |
| 127 | + } catch (Exception e) { |
| 128 | + System.err.println("Could not retrieve and parse to Integer: discover_depth"); |
| 129 | + System.err.println(e.toString()); |
| 130 | + } |
| 131 | + } |
| 132 | + if (dataMap.containsKey("status")) status = (String)dataMap.get("status"); |
| 133 | + if (dataMap.containsKey("fetch_timestamp")) { |
| 134 | + try { |
| 135 | + fetchTimestamp = new SimpleDateFormat(Constants.defaultDateFormat).parse((String)dataMap.get("fetch_timestamp")); |
| 136 | + } catch (Exception e) { |
| 137 | + System.err.println("Could not retrieve and parse to Date: fetch_timestamp"); |
| 138 | + System.err.println(e.toString()); |
| 139 | + } |
| 140 | + } |
| 141 | + if (dataMap.containsKey("crawl_id")) crawlId = (String)dataMap.get("crawl_id"); |
| 142 | + if (dataMap.containsKey("dedupe_id")) dedupeId = (String)dataMap.get("dedupe_id"); |
| 143 | + if (dataMap.containsKey("*_score")) { |
| 144 | + try { |
| 145 | + score = (HashMap<String, Double>)dataMap.get("*_score"); |
| 146 | + } catch (Exception e) { |
| 147 | + System.err.println("Could not retrieve and parse to HashMap<String, Double>: *_score"); |
| 148 | + System.err.println(e.toString()); |
| 149 | + } |
| 150 | + } |
| 151 | + if (dataMap.containsKey("generate_score")) { |
| 152 | + try { |
| 153 | + generateScore = (Double)dataMap.get("generate_score"); |
| 154 | + } catch (Exception e) { |
| 155 | + System.err.println("Could not retrieve and parse to Double: generate_score"); |
| 156 | + System.err.println(e.toString()); |
| 157 | + } |
| 158 | + } |
| 159 | + if (dataMap.containsKey("http_method")) httpMethod = (String)dataMap.get("http_method"); |
| 160 | + if (dataMap.containsKey("jobmeta")) metadata = (String)dataMap.get("jobmeta"); |
| 161 | + if (dataMap.containsKey("last_updated_at")) { |
| 162 | + try { |
| 163 | + lastUpdatedAt = new SimpleDateFormat(Constants.defaultDateFormat).parse((String)dataMap.get("last_updated_at")); |
| 164 | + } catch (Exception e) { |
| 165 | + System.err.println("Could not retrieve and parse to Date: last_updated_at"); |
| 166 | + System.err.println(e.toString()); |
| 167 | + } |
| 168 | + } |
| 169 | + if (dataMap.containsKey("indexed_at")) { |
| 170 | + try { |
| 171 | + indexedAt = new SimpleDateFormat(Constants.defaultDateFormat).parse((String)dataMap.get("indexed_at")); |
| 172 | + } catch (Exception e) { |
| 173 | + System.err.println("Could not retrieve and parse to Date: indexed_at"); |
| 174 | + System.err.println(e.toString()); |
| 175 | + } |
| 176 | + } |
| 177 | + if (dataMap.containsKey("hostname")) hostname = (String)dataMap.get("hostname"); |
| 178 | + if (dataMap.containsKey("parent")) parent = (String)dataMap.get("parent"); |
| 179 | + if (dataMap.containsKey("version")) version = (String)dataMap.get("version"); |
| 180 | + if (dataMap.containsKey("modified_time")) { |
| 181 | + try { |
| 182 | + modifiedTime = new SimpleDateFormat(Constants.defaultDateFormat).parse((String)dataMap.get("modified_time")); |
| 183 | + } catch (Exception e) { |
| 184 | + System.err.println("Could not retrieve and parse to Date: modified_time"); |
| 185 | + System.err.println(e.toString()); |
| 186 | + } |
| 187 | + } |
| 188 | + if (dataMap.containsKey("crawler")) crawler = (String)dataMap.get("crawler"); |
| 189 | + if (dataMap.containsKey("fetch_depth")) { |
| 190 | + try { |
| 191 | + fetchDepth = (Integer)dataMap.get("fetch_depth"); |
| 192 | + } catch (Exception e) { |
| 193 | + System.err.println("Could not retrieve and parse to Integer: fetch_depth"); |
| 194 | + System.err.println(e.toString()); |
| 195 | + } |
| 196 | + } |
| 197 | + if (dataMap.containsKey("page_score")) { |
| 198 | + try { |
| 199 | + pageScore = (Double)dataMap.get("page_score"); |
| 200 | + } catch (Exception e) { |
| 201 | + System.err.println("Could not retrieve and parse to Double: page_score"); |
| 202 | + System.err.println(e.toString()); |
| 203 | + } |
| 204 | + } |
| 205 | + if (dataMap.containsKey("retries_since_fetch")) { |
| 206 | + try { |
| 207 | + retriesSinceFetch = (Integer)dataMap.get("retries_since_fetch"); |
| 208 | + } catch (Exception e) { |
| 209 | + System.err.println("Could not retrieve and parse to Integer: retries_since_fetch"); |
| 210 | + System.err.println(e.toString()); |
| 211 | + } |
| 212 | + } |
| 213 | + if (dataMap.containsKey("fetch_status_code")) { |
| 214 | + try { |
| 215 | + fetchStatusCode = (Integer)dataMap.get("fetch_status_code"); |
| 216 | + } catch (Exception e) { |
| 217 | + System.err.println("Could not retrieve and parse to Integer: fetch_status_code"); |
| 218 | + System.err.println(e.toString()); |
| 219 | + } |
| 220 | + } |
| 221 | + if (dataMap.containsKey("response_time")) { |
| 222 | + try { |
| 223 | + responseTime = new Long((String)dataMap.get("response_time")); |
| 224 | + } catch (Exception e) { |
| 225 | + System.err.println("Could not retrieve and parse to Long: response_time"); |
| 226 | + System.err.println(e.toString()); |
| 227 | + } |
| 228 | + } |
| 229 | + } |
| 230 | + |
104 | 231 | @Override
|
105 | 232 | public String toString() {
|
106 | 233 | return String.format("Resource(%s, %s, %s, %d, %f, %s)",
|
@@ -203,4 +330,70 @@ public String getHttpMethod(){
|
203 | 330 | public String getMetadata(){
|
204 | 331 | return this.metadata;
|
205 | 332 | }
|
| 333 | + |
| 334 | + public Date getLastUpdatedAt() { |
| 335 | + return this.lastUpdatedAt; |
| 336 | + } |
| 337 | + |
| 338 | + public Date getIndexedAt() { |
| 339 | + return this.indexedAt; |
| 340 | + } |
| 341 | + |
| 342 | + public String getHostname() { |
| 343 | + return this.hostname; |
| 344 | + } |
| 345 | + |
| 346 | + public String getParent() { |
| 347 | + return this.parent; |
| 348 | + } |
| 349 | + |
| 350 | + public String getVersion() { return this.version; } |
| 351 | + |
| 352 | + public Date getModifiedTime() { return this.modifiedTime; } |
| 353 | + |
| 354 | + public String getCrawler() { return this.crawler; } |
| 355 | + |
| 356 | + public Integer getFetchDepth() { return this.fetchDepth; } |
| 357 | + |
| 358 | + public Double getPageScore() { return this.pageScore; } |
| 359 | + |
| 360 | + public Integer getRetriesSinceFetch() { return this.retriesSinceFetch; } |
| 361 | + |
| 362 | + public Integer getFetchStatusCode() { return this.fetchStatusCode; } |
| 363 | + |
| 364 | + public Long getResponseTime() { return this.responseTime; } |
| 365 | + |
| 366 | + public Map<String, Object> getDataAsMap() { |
| 367 | + Map<String, Object> dataMap = new HashMap<String, Object>() {{ |
| 368 | + put("id", getId()); |
| 369 | + put("url", getUrl()); |
| 370 | + put("group", getGroup()); |
| 371 | + put("discover_depth", getDiscoverDepth()); |
| 372 | + put("status", getStatus()); |
| 373 | + put("fetch_timestamp", getFetchTimestamp()); |
| 374 | + put("crawl_id", getCrawlId()); |
| 375 | + put("dedupe_id", getDedupeId()); |
| 376 | + put("generate_score", getGenerateScore()); |
| 377 | + put("http_method", getHttpMethod()); |
| 378 | + put("jobmeta", getMetadata()); |
| 379 | + put("last_updated_at", getLastUpdatedAt()); |
| 380 | + put("indexed_at", getIndexedAt()); |
| 381 | + put("hostname", getHostname()); |
| 382 | + put("parent", getParent()); |
| 383 | + put("version", getVersion()); |
| 384 | + put("modified_time", getModifiedTime()); |
| 385 | + put("crawler", getCrawler()); |
| 386 | + put("fetch_depth", getFetchDepth()); |
| 387 | + put("page_score", getPageScore()); |
| 388 | + put("retries_since_fetch", getRetriesSinceFetch()); |
| 389 | + put("fetch_status_code", getFetchStatusCode()); |
| 390 | + put("response_time", getResponseTime()); |
| 391 | + }}; |
| 392 | + |
| 393 | + Map<String, Double> scores = getScore(); |
| 394 | + for (String key : scores.keySet()) { |
| 395 | + dataMap.put(key, scores.get(key)); |
| 396 | + } |
| 397 | + return dataMap; |
| 398 | + } |
206 | 399 | }
|
0 commit comments