@@ -121,6 +121,7 @@ public final class HalyardStats extends AbstractHalyardTool {
121121
122122 private static final long DEFAULT_GRAPH_THRESHOLD = 1000 ;
123123 private static final long DEFAULT_PARTITION_THRESHOLD = 5000 ;
124+ private static final int DEFAULT_DISTINCT2_THRESHOLD = 1000 ;
124125
125126 enum Counters {
126127 REMOVED_STATEMENTS ,
@@ -185,9 +186,11 @@ static final class StatsMapper extends RdfTableMapper<ImmutableBytesWritable, Lo
185186 Value subsetId ;
186187 Set <Value > lastSubsetIds ;
187188 HashTracker hashTracker ;
188- long subsetDistincts ;
189189 Set <Value > lastSubsetDistincts ;
190190 IRI subsetDistinctType ;
191+ Set <Value > lastSubsetDistincts2 ;
192+ int distinct2Threshold = DEFAULT_DISTINCT2_THRESHOLD ;
193+ IRI subsetDistinct2Type ;
191194 HashTracker subhashTracker ;
192195 long setThreshold , setCounter , subsetThreshold , subsetCounter ;
193196 long instanceOfCounter , classPartitionThreshold ;
@@ -246,20 +249,23 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
246249 subhashLen = index .getRole (TermRole .PREDICATE ).keyHashSize ();
247250 subsetType = VOID_EXT .SUBJECT ;
248251 subsetDistinctType = VOID .PROPERTIES ;
252+ subsetDistinct2Type = VOID .DISTINCT_OBJECTS ;
249253 break ;
250254 case POS :
251255 case CPOS :
252256 hashLen = index .getRole (TermRole .PREDICATE ).keyHashSize ();
253257 subhashLen = index .getRole (TermRole .OBJECT ).keyHashSize ();
254258 subsetType = VOID .PROPERTY ;
255259 subsetDistinctType = VOID .DISTINCT_OBJECTS ;
260+ subsetDistinct2Type = VOID .DISTINCT_SUBJECTS ;
256261 break ;
257262 case OSP :
258263 case COSP :
259264 hashLen = index .getRole (TermRole .OBJECT ).keyHashSize ();
260265 subhashLen = index .getRole (TermRole .SUBJECT ).keyHashSize ();
261266 subsetType = VOID_EXT .OBJECT ;
262267 subsetDistinctType = VOID .DISTINCT_SUBJECTS ;
268+ subsetDistinct2Type = VOID .PROPERTIES ;
263269 break ;
264270 default :
265271 throw new IOException ("Unknown index #" + index );
@@ -273,10 +279,12 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
273279 if (!hashTracker .equals (key )) {
274280 lastSubsetIds = new HashSet <>();
275281 lastSubsetDistincts = new HashSet <>();
282+ lastSubsetDistincts2 = new HashSet <>(distinct2Threshold );
276283 }
277284
278285 if (!subhashTracker .equals (key )) {
279286 lastSubsetDistincts = new HashSet <>();
287+ lastSubsetDistincts2 = new HashSet <>(distinct2Threshold );
280288 }
281289
282290 Statement [] stmts = stmtIndices .parseStatements (null , null , null , null , value , vf );
@@ -320,8 +328,13 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
320328 subsetId = subj ;
321329 }
322330 IRI pred = stmt .getPredicate ();
323- if (lastSubsetDistincts .add (pred )) {
324- subsetDistincts ++;
331+ lastSubsetDistincts .add (pred );
332+ if (lastSubsetDistincts2 != null ) {
333+ Value obj = stmt .getObject ();
334+ lastSubsetDistincts2 .add (obj );
335+ if (lastSubsetDistincts2 .size () > distinct2Threshold ) {
336+ lastSubsetDistincts2 = null ;
337+ }
325338 }
326339 triples ++;
327340 }
@@ -336,8 +349,13 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
336349 subsetId = pred ;
337350 }
338351 Value obj = stmt .getObject ();
339- if (lastSubsetDistincts .add (obj )) {
340- subsetDistincts ++;
352+ lastSubsetDistincts .add (obj );
353+ if (lastSubsetDistincts2 != null ) {
354+ Value subj = stmt .getSubject ();
355+ lastSubsetDistincts2 .add (subj );
356+ if (lastSubsetDistincts2 .size () > distinct2Threshold ) {
357+ lastSubsetDistincts2 = null ;
358+ }
341359 }
342360 if (RDF .TYPE .equals (pred )) {
343361 if (!obj .equals (rdfClass )) {
@@ -368,8 +386,13 @@ protected void map(ImmutableBytesWritable rowKey, Result value, Context output)
368386 subsetId = obj ;
369387 }
370388 Value subj = stmt .getSubject ();
371- if (lastSubsetDistincts .add (subj )) {
372- subsetDistincts ++;
389+ lastSubsetDistincts .add (subj );
390+ if (lastSubsetDistincts2 != null ) {
391+ Value pred = stmt .getPredicate ();
392+ lastSubsetDistincts2 .add (pred );
393+ if (lastSubsetDistincts2 .size () > distinct2Threshold ) {
394+ lastSubsetDistincts2 = null ;
395+ }
373396 }
374397 }
375398 break ;
@@ -482,14 +505,15 @@ private void reset(Context output) throws IOException, InterruptedException {
482505 }
483506
484507 private void resetSubset (Context output ) throws IOException , InterruptedException {
485- assert subsetDistincts <= subsetCounter ;
508+ assert lastSubsetDistincts . size () <= subsetCounter ;
486509 if (subsetCounter >= subsetThreshold ) {
487510 report (output , subsetType , subsetId , VOID .TRIPLES , subsetCounter );
488- report (output , subsetType , subsetId , subsetDistinctType , subsetDistincts );
511+ report (output , subsetType , subsetId , subsetDistinctType , lastSubsetDistincts .size ());
512+ report (output , subsetType , subsetId , subsetDistinct2Type , lastSubsetDistincts2 .size ());
489513 }
490514 subsetCounter = 0 ;
491- subsetDistincts = 0 ;
492515 lastSubsetDistincts = new HashSet <>();
516+ lastSubsetDistincts2 = new HashSet <>(distinct2Threshold );
493517 resetClass (output );
494518 }
495519
0 commit comments