33 so we can test things in the meanwhile
44"""
55# Third party imports
6- import dataclasses
76from datetime import datetime
87import click
98
109# Python imports
1110from typing import List , Tuple
1211from urllib .error import HTTPError
1312import os
13+ from pathlib import Path
1414import sys
15- from c4v import microscope
16- from c4v .classifier .classifier import Labels
1715
1816# Local imports
1917from c4v .scraper .scraped_data_classes .scraped_data import ScrapedData
@@ -38,14 +36,18 @@ def c4v_cli():
3836 Command entry point
3937 """
4038 # init files if necessary:
41- if not os .path .isdir (DEFAULT_FILES_FOLDER ):
42- click .echo (
43- f"[INFO] Creating local files folder at: { DEFAULT_FILES_FOLDER } " , err = True
44- )
39+ path = Path (DEFAULT_FILES_FOLDER )
40+ if not path .exists ():
41+ click .echo (f"[INFO] Creating local files folder at: { DEFAULT_FILES_FOLDER } " )
4542 try :
46- os .mkdir (DEFAULT_FILES_FOLDER )
43+ path .mkdir (parents = True )
4744 except Exception as e :
48- print (e )
45+ print (f"[ERROR] Could not create '{ path } ' folder: { e } " , err = True )
46+ elif not path .is_dir ():
47+ click .echo (
48+ f"[ERROR] Files folder '{ path } ' already exists but it's not a file." ,
49+ err = True ,
50+ )
4951
5052
5153@c4v_cli .command ()
@@ -78,13 +80,14 @@ def scrape(
7880 + loud : bool = if should print scraped data once a scraping is finished\n
7981 """
8082
81- db_manager = SqliteManager (DEFAULT_DB )
82- client = CLIClient (Manager (db_manager ), urls , files )
83+ manager = Manager .from_default ()
84+ client = CLIClient (manager , urls , files )
85+
8386 # Read urls
8487 urls_to_scrape = []
8588
8689 if not urls :
87- urls_to_scrape = [d .url for d in db_manager .get_all (limit , scraped = False )]
90+ urls_to_scrape = [d .url for d in manager .get_all (limit , scraped = False )]
8891 elif files : # if urls are stored in files
8992 urls_to_scrape = client .get_urls (urls )
9093 else :
@@ -165,13 +168,18 @@ def crawl(
165168@click .option ("--urls" , is_flag = True , help = "Only list urls" )
166169@click .option ("--limit" , default = 100 , help = 'List only up to "limit" rows' )
167170@click .option ("--col-len" , default = 50 , help = "Columns max length" )
171+ @click .option ("--count" , is_flag = True , help = "Print only count of selected data" )
168172@click .option (
169173 "--scraped-only" ,
170174 default = None ,
171175 help = "Retrieve only complete rows, those with its scraped data" ,
172176)
173177def list (
174- urls : bool = False , limit : int = 100 , col_len : int = 50 , scraped_only : bool = None
178+ urls : bool = False ,
179+ limit : int = 100 ,
180+ col_len : int = 50 ,
181+ count : bool = False ,
182+ scraped_only : bool = None ,
175183):
176184 """
177185 List requested info as specified by arguments.\n
@@ -183,7 +191,9 @@ def list(
183191 """
184192
185193 scraped_only = (
186- scraped_only == "true" or scraped_only == "True" or scraped_only == "1"
194+ (scraped_only == "true" or scraped_only == "True" or scraped_only == "1" )
195+ if scraped_only
196+ else None
187197 )
188198
189199 db_manager = SqliteManager (DEFAULT_DB )
@@ -194,28 +204,38 @@ def list(
194204 click .echo (data .url )
195205 return
196206
207+ data = [d for d in db_manager .get_all (limit , scraped_only )]
208+
209+ if count :
210+ click .echo (len (data ))
211+ return
212+
197213 # Get printable version of retrieved data
198- data_to_print = data_list_to_table_str (
199- [d for d in db_manager .get_all (limit , scraped_only )], max_cell_len = col_len
200- )
214+ data_to_print = data_list_to_table_str (data , max_cell_len = col_len )
201215
202216 click .echo (data_to_print )
203- print (scraped_only )
204217
205218
206219@c4v_cli .command ()
207220@click .option (
208221 "--no-scrape" , is_flag = True , help = "Don't scrape if url is not found in DB"
209222)
210223@click .option ("--file" , is_flag = True , help = "Get urls of news to classify from a file" )
224+ @click .option (
225+ "--limit" ,
226+ is_flag = False ,
227+ help = "Limit how much instances to classify in this run. Specially usefull when classifying pending data, if less than 0, then select as much as you can (default). Otherwise, classify at the most the given number" ,
228+ type = int ,
229+ )
211230@click .argument ("inputs" , nargs = - 1 )
212- def classify (inputs : List [str ] = [], no_scrape : bool = False , file : bool = False ):
231+ def classify (
232+ inputs : List [str ] = [], no_scrape : bool = False , file : bool = False , limit : int = - 1
233+ ):
213234 """
214235 Run a classification over a given url or from a file, using the model stored in the provided
215236 experiment. Usage:
216237 c4v classify <branch_name>/<experiment_name> <url>
217238 """
218-
219239 # Validate input:
220240 n_args = len (inputs )
221241 if (
@@ -226,8 +246,9 @@ def classify(inputs: List[str] = [], no_scrape: bool = False, file: bool = False
226246 )
227247 return
228248
229- manager = Manager .from_local_sqlite_db (DEFAULT_DB )
230- client = CLIClient (manager , inputs [1 :], file )
249+ # Create manager object
250+ manager = Manager .from_default ()
251+ client = CLIClient (manager , file )
231252
232253 # validate branch and name
233254 parsed_branch_and_name = CLIClient .parse_branch_and_experiment_from (inputs [0 ])
@@ -236,8 +257,21 @@ def classify(inputs: List[str] = [], no_scrape: bool = False, file: bool = False
236257 else :
237258 branch , experiment = parsed_branch_and_name
238259
239- # Now get data for each url
240- data = client .get_data_for_urls (should_scrape = not no_scrape )
260+ # check if we have to classify pending data
261+ classify_pending = n_args == 2 and inputs [1 ] == "pending"
262+ if classify_pending :
263+ res = manager .run_pending_classification_from_experiment (
264+ branch , experiment , save = True , limit = limit
265+ )
266+ click .echo (f"[INFO] { len (res )} classified rows" )
267+ return
268+
269+ data = client .get_data_for_urls (urls = inputs [1 :], should_scrape = not no_scrape )
270+
271+ # Do nothing if not necessary:
272+ if not data :
273+ click .echo ("[INFO] Nothing to classify" )
274+ return
241275
242276 # Try to classify given data
243277 try :
@@ -247,10 +281,13 @@ def classify(inputs: List[str] = [], no_scrape: bool = False, file: bool = False
247281 return
248282
249283 # Pretty print results:
250- for (url , result ) in results .items ():
251- click .echo (f"\t { url } " )
252- for (key , value ) in result .items ():
253- click .echo (f"\t \t * { key } : { value } " )
284+ for result in results :
285+ click .echo ("\n " )
286+ data : ScrapedData = result ["data" ]
287+ scores = result ["scores" ]
288+ click .echo (f"\t { data .title if data .title else '<no title>' } ({ data .url } )" )
289+ click .echo (f"\t \t { data .label } " )
290+ click .echo (f"\t \t { scores } " )
254291
255292
256293@c4v_cli .command ()
@@ -263,7 +300,7 @@ def show(url: str, no_scrape: bool = False):
263300 Show the entire data for a given URL
264301 """
265302 # Create manager object
266- manager = Manager .from_local_sqlite_db ( DEFAULT_DB )
303+ manager = Manager .from_default ( )
267304 client = CLIClient (manager , [url ])
268305
269306 data = client .get_data_for_urls (should_scrape = not no_scrape )
@@ -332,7 +369,7 @@ def explain(
332369 experiment : str = experiment format, following <branch_name>/<experiment_name> format
333370 sentence : str = expression to explain
334371 """
335- microscope_manager = Manager .from_local_sqlite_db ( DEFAULT_DB )
372+ microscope_manager = Manager .from_default ( )
336373 client = CLIClient (microscope_manager )
337374
338375 # Get text to explain
@@ -387,6 +424,106 @@ def explain(
387424 click .echo (f"\t * { word } : { score } " )
388425
389426
@c4v_cli.group()
def experiment():
    """
    Experiment Management. You can get info about experiments with this command, such as
    listing, and removing them if no longer necessary.

    Runs before every `experiment` sub-command to make sure the experiments
    folder (settings.experiments_dir) exists on disk.
    """
    experiments_path = Path(settings.experiments_dir)

    if not experiments_path.exists():
        # Folder is missing: create it, reporting any filesystem failure
        click.echo(f"[INFO] Creating experiments folder in: {experiments_path}")
        try:
            experiments_path.mkdir()
        except Exception as e:
            click.echo(
                f"[ERROR] Could not create folder due to the following error: {e}",
                err=True,
            )
    elif not experiments_path.is_dir():
        # Path exists but is a regular file, so it cannot hold experiments
        click.echo(
            f"[ERROR] Could not create folder {experiments_path}. File already exists but is not a folder",
            err=True,
        )
450+
@experiment.command()
@click.argument("branch", nargs=1, required=False)
def ls(branch: str = None):
    """
    List branches if no argument is provided. If a branch name is specified, then list experiments within that branch.
    Examples:
        c4v experiment ls
            branch1
            branch2
            branch3
        c4v experiment ls branch1
            experiment1
            experiment2
    """
    # TODO: maybe move this logic into the ExperimentFSManager?

    # No branch requested: list the branch directories themselves
    if not branch:
        click.echo(f"[INFO] Listing from {settings.experiments_dir}")
        files = CLIClient._ls_files(settings.experiments_dir)
        click.echo("\n".join(files))
        return

    # Check if branch exists
    path = Path(settings.experiments_dir, branch)
    if not path.exists():
        click.echo(
            f"[ERROR] This is not a valid branch name: {branch} in {path}. You can see available branches using the command:\n\tc4v experiment ls",
            err=True,
        )
        return
    elif not path.is_dir():
        click.echo(
            f"[ERROR] Invalid Branch path: {path}. The branch name '{branch}' does not refer to an actual branch's directory",
            err=True,
        )
        # BUG FIX: previously this branch fell through and still printed an
        # "[INFO] Listing" line for a path that is not a directory
        return

    # As everything is ok, just list files
    click.echo(f"[INFO] Listing from {path}")
    files = CLIClient._ls_files(path=str(path))
    click.echo("\n".join(files))
490+
@experiment.command()
@click.argument("experiment", nargs=1)
def summary(experiment: str):
    """
    Print summary for an existent experiment given its name with branch and experiment name\n
    Example:\n
        `c4v experiment summary branch_name/experiment_name`
    """

    # Parse branch and experiment name; the parser reports format errors itself
    branch_and_experiment = CLIClient.parse_branch_and_experiment_from(experiment)
    if not branch_and_experiment:
        return

    # As everything went ok, unpack branch name and experiment name
    branch_name, experiment_name = branch_and_experiment

    path = Path(settings.experiments_dir, branch_name, experiment_name, "summary.txt")
    # Check that the summary file exists and is a regular file
    if not path.exists():
        click.echo(
            f"[ERROR] Summary for experiment {experiment} not found in {path}", err=True
        )
        return
    elif not path.is_file():
        # BUG FIX: corrected "Sumamry" typo in this user-facing error message
        click.echo(
            f"[ERROR] Summary for experiment {experiment} in {path} is not a valid file",
            err=True,
        )
        return

    # As everything went ok, print file content
    click.echo(f"[INFO] Reading summary from: {path}")
    click.echo(path.read_text())
526+
390527class CLIClient :
391528 """
392529 This class will manage common operations performed by the CLI tool
@@ -398,7 +535,7 @@ def __init__(
398535
399536 # Default manager
400537 if not manager :
401- manager = Manager .from_local_sqlite_db ( DEFAULT_DB )
538+ manager = Manager .from_default ( local_files_path = settings . c4v_folder )
402539
403540 self ._manager = manager
404541
@@ -423,6 +560,7 @@ def get_data_for_urls(
423560 urls_to_retrieve = urls or self ._urls
424561
425562 # Check scrapable urls:
563+
426564 scrapable_urls , non_scrapables = self ._manager .split_non_scrapable (
427565 urls_to_retrieve
428566 )
@@ -493,7 +631,7 @@ def parse_branch_and_experiment_from(line: str) -> Tuple[str, str]:
493631 return (branch , name )
494632
495633 click .echo (
496- f"[ERROR] Given experiment name is not valid: { line } . Should be in the form:" ,
634+ f"[ERROR] Given experiment name is not valid: { line } . Should be of the form:" ,
497635 err = True ,
498636 )
499637 for separator in separators :
@@ -557,6 +695,13 @@ def _parse_lines_from_files(files: List[str]) -> List[str]:
557695 )
558696 return lines
559697
698+ @staticmethod
699+ def _ls_files (path : str ) -> List [str ]:
700+ """
701+ Returns a list of file names within a given directory
702+ """
703+ return [str (x .name ) for x in Path (path ).glob ("*" )]
704+
560705
# Script entry point: dispatch to the click command group when run directly
if __name__ == "__main__":
    c4v_cli()
0 commit comments