4343from .collection .remotecoll import RemoteCollection
4444from .config import config
4545from .dash import History
46+ from .discovery .paperoni_v2 import PaperoniV2
4647from .display import display , print_field , terminal_width
4748from .fulltext .locate import URL , locate_all
4849from .fulltext .pdf import PDF , CachePolicies , get_pdf
@@ -247,6 +248,7 @@ class Configure:
247248 """Configure the workset."""
248249
249250 n : int
251+ drop_zero : bool = True
250252 clear : bool = False
251253
252254 async def run (self , work : "Work" ):
@@ -255,14 +257,15 @@ async def run(self, work: "Work"):
255257 top = deserialize (
256258 Top [Scored [CommentRec [PaperWorkingSet , float ]]], work_file
257259 )
260+ top .drop_zero = self .drop_zero
258261 if self .clear :
259262 top .entries = []
260263 elif top .n > self .n :
261264 top .entries = list (top )[: self .n ]
262265 top .resort ()
263266 top .n = self .n
264267 else :
265- top = Top (self .n )
268+ top = Top (self .n , drop_zero = self . drop_zero )
266269 work .save (top )
267270 print (f"Configured { work_file .resolve ()} for n={ self .n } " )
268271
@@ -701,8 +704,122 @@ async def run(self, coll: "Coll"):
701704 elif not len (coll .collection ) and not len (await coll .collection .exclusions ()):
702705 logging .warning ("Collection is not empty. Use --force to drop it." )
703706
@dataclass
class Validate:
    """Validate the papers in the collection using the paperoni v2 database."""

    # The paperoni v2 database
    # [optional]
    # [metavar v2]
    paperoni_v2: Auto[PaperoniV2.query] = None

    # Validate papers having a score greater than the threshold
    # [metavar FLOAT]
    threshold: float = None

    async def iterate(
        self, coll: "Coll" = None, **kwargs
    ) -> AsyncGenerator[Paper, None]:
        """Yield the papers that are candidates for the "valid" flag.

        When ``paperoni_v2`` is set, it is called with ``**kwargs`` and every
        paper it produces that carries the "valid" flag is yielded; progress
        is reported through ``send`` along the way.

        Otherwise ``coll.collection.search()`` is walked and a paper is
        yielded when it is not already flagged "valid" and its score is not
        below the threshold (``self.threshold`` if given, else
        ``config.autovalidate.score_threshold``).
        """
        if self.paperoni_v2 is not None:
            validated = 0
            total = 0
            async for paper_v2 in self.paperoni_v2(**kwargs):
                paper_v2: Paper
                total += 1

                if "valid" not in paper_v2.flags:
                    continue

                validated += 1

                yield paper_v2

                send(progress=("Validated v2 papers", validated, total))

            # Final report once the v2 source is exhausted.
            send(progress=("Validated v2 papers", None, total))

        else:
            # Compare against None explicitly: an explicit threshold of 0 is
            # falsy and would otherwise be silently replaced by the
            # configured default.
            if self.threshold is not None:
                score_threshold = self.threshold
            else:
                score_threshold = config.autovalidate.score_threshold

            async for paper in coll.collection.search():
                paper: Paper

                if "valid" in paper.flags:
                    # Already validated, nothing to do.
                    continue

                # A falsy stored score (None or 0) falls back to a score
                # computed by config.focuses.
                score = paper.score or config.focuses.score(paper)
                if score < score_threshold:
                    continue

                yield paper

    async def run(self, coll: "Coll"):
        """Flag every candidate from ``iterate`` as "valid" in the collection.

        Each candidate is looked up with ``coll.collection.find_paper``.
        Matches flagged "invalid" are counted as ignored and left alone;
        other matches get the "valid" flag and are saved through
        ``coll.collection.edit_paper``. Candidates with no match are counted
        but otherwise skipped. Totals are reported through ``send``.
        """
        ignored = 0
        validated = 0
        count = 0

        async for paper in self.iterate(coll=coll):
            count += 1

            if coll_paper := await coll.collection.find_paper(paper):
                if "invalid" in coll_paper.flags:
                    ignored += 1
                    continue

                validated += 1
                coll_paper.flags.add("valid")
                await coll.collection.edit_paper(coll_paper)

        if ignored and ignored != count:
            send(progress=("Ignored papers", ignored, count))

        send(progress=("Validated papers", validated, count))
776+
@dataclass
class Diff:
    """Diff the paper collection and another collection.

    The output directory will contain two files:
    - missing.json: Papers in the other collection that are not in the current collection
    - extra.json: Papers in the current collection that are not in the other collection
    """

    # The other collection
    # [positional]
    other_collection_path: str

    # Output directory
    out: Path

    # Format of the output files
    # [alias: --fmt]
    format: Literal["json", "yaml"] = "json"

    async def run(self, coll: "Coll"):
        """Write the two-way difference between the collections to ``out``.

        Two files are produced, named ``missing.<format>`` and
        ``extra.<format>``; any file already present under those names is
        removed before the new one is written.
        """
        theirs = FileCollection(file=Path(self.other_collection_path))
        ours = coll.collection

        missing_file = self.out / f"missing.{self.format}"
        extra_file = self.out / f"extra.{self.format}"

        # Papers present in the other collection but not found in ours.
        not_in_ours = [
            paper
            async for paper in theirs.search()
            if not await ours.find_paper(paper)
        ]

        self.out.mkdir(exist_ok=True, parents=True)
        missing_file.unlink(missing_ok=True)
        await FileCollection(file=missing_file).add_papers(not_in_ours)

        # Papers present in ours but not found in the other collection.
        not_in_theirs = [
            paper
            async for paper in ours.search()
            if not await theirs.find_paper(paper)
        ]

        extra_file.unlink(missing_ok=True)
        await FileCollection(file=extra_file).add_papers(not_in_theirs)
820+
704821 # Command to execute
705- command : TaggedUnion [Search , Import , Export , Drop ]
822+ command : TaggedUnion [Search , Import , Export , Drop , Validate , Diff ]
706823
707824 # Collection dir
708825 # [alias: -c]
0 commit comments