Skip to content

Commit 990b9ee

Browse files
author
Mark Hale
committed
Collect more stats.
1 parent ecbc941 commit 990b9ee

1 file changed

Lines changed: 34 additions & 10 deletions

File tree

tools/src/main/java/com/msd/gin/halyard/tools/HalyardStats.java

Lines changed: 34 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@ public final class HalyardStats extends AbstractHalyardTool {
121121

122122
private static final long DEFAULT_GRAPH_THRESHOLD = 1000;
123123
private static final long DEFAULT_PARTITION_THRESHOLD = 5000;
124+
private static final int DEFAULT_DISTINCT2_THRESHOLD = 1000;
124125

125126
enum Counters {
126127
REMOVED_STATEMENTS,
@@ -185,9 +186,11 @@ static final class StatsMapper extends RdfTableMapper<ImmutableBytesWritable, Lo
185186
Value subsetId;
186187
Set<Value> lastSubsetIds;
187188
HashTracker hashTracker;
188-
long subsetDistincts;
189189
Set<Value> lastSubsetDistincts;
190190
IRI subsetDistinctType;
191+
Set<Value> lastSubsetDistincts2;
192+
int distinct2Threshold = DEFAULT_DISTINCT2_THRESHOLD;
193+
IRI subsetDistinct2Type;
191194
HashTracker subhashTracker;
192195
long setThreshold, setCounter, subsetThreshold, subsetCounter;
193196
long instanceOfCounter, classPartitionThreshold;
@@ -246,20 +249,23 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
246249
subhashLen = index.getRole(TermRole.PREDICATE).keyHashSize();
247250
subsetType = VOID_EXT.SUBJECT;
248251
subsetDistinctType = VOID.PROPERTIES;
252+
subsetDistinct2Type = VOID.DISTINCT_OBJECTS;
249253
break;
250254
case POS:
251255
case CPOS:
252256
hashLen = index.getRole(TermRole.PREDICATE).keyHashSize();
253257
subhashLen = index.getRole(TermRole.OBJECT).keyHashSize();
254258
subsetType = VOID.PROPERTY;
255259
subsetDistinctType = VOID.DISTINCT_OBJECTS;
260+
subsetDistinct2Type = VOID.DISTINCT_SUBJECTS;
256261
break;
257262
case OSP:
258263
case COSP:
259264
hashLen = index.getRole(TermRole.OBJECT).keyHashSize();
260265
subhashLen = index.getRole(TermRole.SUBJECT).keyHashSize();
261266
subsetType = VOID_EXT.OBJECT;
262267
subsetDistinctType = VOID.DISTINCT_SUBJECTS;
268+
subsetDistinct2Type = VOID.PROPERTIES;
263269
break;
264270
default:
265271
throw new IOException("Unknown index #" + index);
@@ -273,10 +279,12 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
273279
if (!hashTracker.equals(key)) {
274280
lastSubsetIds = new HashSet<>();
275281
lastSubsetDistincts = new HashSet<>();
282+
lastSubsetDistincts2 = new HashSet<>(distinct2Threshold);
276283
}
277284

278285
if (!subhashTracker.equals(key)) {
279286
lastSubsetDistincts = new HashSet<>();
287+
lastSubsetDistincts2 = new HashSet<>(distinct2Threshold);
280288
}
281289

282290
Statement[] stmts = stmtIndices.parseStatements(null, null, null, null, value, vf);
@@ -320,8 +328,13 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
320328
subsetId = subj;
321329
}
322330
IRI pred = stmt.getPredicate();
323-
if (lastSubsetDistincts.add(pred)) {
324-
subsetDistincts++;
331+
lastSubsetDistincts.add(pred);
332+
if (lastSubsetDistincts2 != null) {
333+
Value obj = stmt.getObject();
334+
lastSubsetDistincts2.add(obj);
335+
if (lastSubsetDistincts2.size() > distinct2Threshold) {
336+
lastSubsetDistincts2 = null;
337+
}
325338
}
326339
triples++;
327340
}
@@ -336,8 +349,13 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
336349
subsetId = pred;
337350
}
338351
Value obj = stmt.getObject();
339-
if (lastSubsetDistincts.add(obj)) {
340-
subsetDistincts++;
352+
lastSubsetDistincts.add(obj);
353+
if (lastSubsetDistincts2 != null) {
354+
Value subj = stmt.getSubject();
355+
lastSubsetDistincts2.add(subj);
356+
if (lastSubsetDistincts2.size() > distinct2Threshold) {
357+
lastSubsetDistincts2 = null;
358+
}
341359
}
342360
if (RDF.TYPE.equals(pred)) {
343361
if (!obj.equals(rdfClass)) {
@@ -368,8 +386,13 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
368386
subsetId = obj;
369387
}
370388
Value subj = stmt.getSubject();
371-
if (lastSubsetDistincts.add(subj)) {
372-
subsetDistincts++;
389+
lastSubsetDistincts.add(subj);
390+
if (lastSubsetDistincts2 != null) {
391+
Value pred = stmt.getPredicate();
392+
lastSubsetDistincts2.add(pred);
393+
if (lastSubsetDistincts2.size() > distinct2Threshold) {
394+
lastSubsetDistincts2 = null;
395+
}
373396
}
374397
}
375398
break;
@@ -482,14 +505,15 @@ private void reset(Context output) throws IOException, InterruptedException {
482505
}
483506

484507
private void resetSubset(Context output) throws IOException, InterruptedException {
485-
assert subsetDistincts <= subsetCounter;
508+
assert lastSubsetDistincts.size() <= subsetCounter;
486509
if (subsetCounter >= subsetThreshold) {
487510
report(output, subsetType, subsetId, VOID.TRIPLES, subsetCounter);
488-
report(output, subsetType, subsetId, subsetDistinctType, subsetDistincts);
511+
report(output, subsetType, subsetId, subsetDistinctType, lastSubsetDistincts.size());
512+
report(output, subsetType, subsetId, subsetDistinct2Type, lastSubsetDistincts2.size());
489513
}
490514
subsetCounter = 0;
491-
subsetDistincts = 0;
492515
lastSubsetDistincts = new HashSet<>();
516+
lastSubsetDistincts2 = new HashSet<>(distinct2Threshold);
493517
resetClass(output);
494518
}
495519

0 commit comments

Comments
 (0)