Skip to content

Commit 7c7c676

Browse files
authored
Merge pull request #93 from code-for-venezuela/luis/metadata
Luis/metadata
2 parents 23a1049 + d58034f commit 7c7c676

File tree

14 files changed

+516
-114
lines changed

14 files changed

+516
-114
lines changed

src/c4v/c4v_cli.py

Lines changed: 177 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,15 @@
33
so we can test things in the meanwhile
44
"""
55
# Third party imports
6-
import dataclasses
76
from datetime import datetime
87
import click
98

109
# Python imports
1110
from typing import List, Tuple
1211
from urllib.error import HTTPError
1312
import os
13+
from pathlib import Path
1414
import sys
15-
from c4v import microscope
16-
from c4v.classifier.classifier import Labels
1715

1816
# Local imports
1917
from c4v.scraper.scraped_data_classes.scraped_data import ScrapedData
@@ -38,14 +36,18 @@ def c4v_cli():
3836
Command entry point
3937
"""
4038
# init files if necessary:
41-
if not os.path.isdir(DEFAULT_FILES_FOLDER):
42-
click.echo(
43-
f"[INFO] Creating local files folder at: {DEFAULT_FILES_FOLDER}", err=True
44-
)
39+
path = Path(DEFAULT_FILES_FOLDER)
40+
if not path.exists():
41+
click.echo(f"[INFO] Creating local files folder at: {DEFAULT_FILES_FOLDER}")
4542
try:
46-
os.mkdir(DEFAULT_FILES_FOLDER)
43+
path.mkdir(parents=True)
4744
except Exception as e:
48-
print(e)
45+
print(f"[ERROR] Could not create '{path}' folder: {e}", err=True)
46+
elif not path.is_dir():
47+
click.echo(
48+
f"[ERROR] Files folder '{path}' already exists but it's not a file.",
49+
err=True,
50+
)
4951

5052

5153
@c4v_cli.command()
@@ -78,13 +80,14 @@ def scrape(
7880
+ loud : bool = if should print scraped data once a scraping is finished\n
7981
"""
8082

81-
db_manager = SqliteManager(DEFAULT_DB)
82-
client = CLIClient(Manager(db_manager), urls, files)
83+
manager = Manager.from_default()
84+
client = CLIClient(manager, urls, files)
85+
8386
# Read urls
8487
urls_to_scrape = []
8588

8689
if not urls:
87-
urls_to_scrape = [d.url for d in db_manager.get_all(limit, scraped=False)]
90+
urls_to_scrape = [d.url for d in manager.get_all(limit, scraped=False)]
8891
elif files: # if urls are stored in files
8992
urls_to_scrape = client.get_urls(urls)
9093
else:
@@ -165,13 +168,18 @@ def crawl(
165168
@click.option("--urls", is_flag=True, help="Only list urls")
166169
@click.option("--limit", default=100, help='List only up to "limit" rows')
167170
@click.option("--col-len", default=50, help="Columns max length")
171+
@click.option("--count", is_flag=True, help="Print only count of selected data")
168172
@click.option(
169173
"--scraped-only",
170174
default=None,
171175
help="Retrieve only complete rows, those with its scraped data",
172176
)
173177
def list(
174-
urls: bool = False, limit: int = 100, col_len: int = 50, scraped_only: bool = None
178+
urls: bool = False,
179+
limit: int = 100,
180+
col_len: int = 50,
181+
count: bool = False,
182+
scraped_only: bool = None,
175183
):
176184
"""
177185
List requested info as specified by arguments.\n
@@ -183,7 +191,9 @@ def list(
183191
"""
184192

185193
scraped_only = (
186-
scraped_only == "true" or scraped_only == "True" or scraped_only == "1"
194+
(scraped_only == "true" or scraped_only == "True" or scraped_only == "1")
195+
if scraped_only
196+
else None
187197
)
188198

189199
db_manager = SqliteManager(DEFAULT_DB)
@@ -194,28 +204,38 @@ def list(
194204
click.echo(data.url)
195205
return
196206

207+
data = [d for d in db_manager.get_all(limit, scraped_only)]
208+
209+
if count:
210+
click.echo(len(data))
211+
return
212+
197213
# Get printable version of retrieved data
198-
data_to_print = data_list_to_table_str(
199-
[d for d in db_manager.get_all(limit, scraped_only)], max_cell_len=col_len
200-
)
214+
data_to_print = data_list_to_table_str(data, max_cell_len=col_len)
201215

202216
click.echo(data_to_print)
203-
print(scraped_only)
204217

205218

206219
@c4v_cli.command()
207220
@click.option(
208221
"--no-scrape", is_flag=True, help="Don't scrape if url is not found in DB"
209222
)
210223
@click.option("--file", is_flag=True, help="Get urls of news to classify from a file")
224+
@click.option(
225+
"--limit",
226+
is_flag=False,
227+
help="Limit how much instances to classify in this run. Specially usefull when classifying pending data, if less than 0, then select as much as you can (default). Otherwise, classify at the most the given number",
228+
type=int,
229+
)
211230
@click.argument("inputs", nargs=-1)
212-
def classify(inputs: List[str] = [], no_scrape: bool = False, file: bool = False):
231+
def classify(
232+
inputs: List[str] = [], no_scrape: bool = False, file: bool = False, limit: int = -1
233+
):
213234
"""
214235
Run a classification over a given url or from a file, using the model stored in the provided
215236
experiment. Usage:
216237
c4v classify <branch_name>/<experiment_name> <url>
217238
"""
218-
219239
# Validate input:
220240
n_args = len(inputs)
221241
if (
@@ -226,8 +246,9 @@ def classify(inputs: List[str] = [], no_scrape: bool = False, file: bool = False
226246
)
227247
return
228248

229-
manager = Manager.from_local_sqlite_db(DEFAULT_DB)
230-
client = CLIClient(manager, inputs[1:], file)
249+
# Create manager object
250+
manager = Manager.from_default()
251+
client = CLIClient(manager, file)
231252

232253
# validate branch and name
233254
parsed_branch_and_name = CLIClient.parse_branch_and_experiment_from(inputs[0])
@@ -236,8 +257,21 @@ def classify(inputs: List[str] = [], no_scrape: bool = False, file: bool = False
236257
else:
237258
branch, experiment = parsed_branch_and_name
238259

239-
# Now get data for each url
240-
data = client.get_data_for_urls(should_scrape=not no_scrape)
260+
# check if we have to classify pending data
261+
classify_pending = n_args == 2 and inputs[1] == "pending"
262+
if classify_pending:
263+
res = manager.run_pending_classification_from_experiment(
264+
branch, experiment, save=True, limit=limit
265+
)
266+
click.echo(f"[INFO] {len(res)} classified rows")
267+
return
268+
269+
data = client.get_data_for_urls(urls=inputs[1:], should_scrape=not no_scrape)
270+
271+
# Do nothing if not necessary:
272+
if not data:
273+
click.echo("[INFO] Nothing to classify")
274+
return
241275

242276
# Try to classify given data
243277
try:
@@ -247,10 +281,13 @@ def classify(inputs: List[str] = [], no_scrape: bool = False, file: bool = False
247281
return
248282

249283
# Pretty print results:
250-
for (url, result) in results.items():
251-
click.echo(f"\t{url}")
252-
for (key, value) in result.items():
253-
click.echo(f"\t\t* {key} : {value}")
284+
for result in results:
285+
click.echo("\n")
286+
data: ScrapedData = result["data"]
287+
scores = result["scores"]
288+
click.echo(f"\t{data.title if data.title else '<no title>'} ({data.url})")
289+
click.echo(f"\t\t{data.label}")
290+
click.echo(f"\t\t{scores}")
254291

255292

256293
@c4v_cli.command()
@@ -263,7 +300,7 @@ def show(url: str, no_scrape: bool = False):
263300
Show the entire data for a given URL
264301
"""
265302
# Create manager object
266-
manager = Manager.from_local_sqlite_db(DEFAULT_DB)
303+
manager = Manager.from_default()
267304
client = CLIClient(manager, [url])
268305

269306
data = client.get_data_for_urls(should_scrape=not no_scrape)
@@ -332,7 +369,7 @@ def explain(
332369
experiment : str = experiment format, following <branch_name>/<experiment_name> format
333370
sentence : str = expression to explain
334371
"""
335-
microscope_manager = Manager.from_local_sqlite_db(DEFAULT_DB)
372+
microscope_manager = Manager.from_default()
336373
client = CLIClient(microscope_manager)
337374

338375
# Get text to explain
@@ -387,6 +424,106 @@ def explain(
387424
click.echo(f"\t* {word} : {score}")
388425

389426

427+
@c4v_cli.group()
def experiment():
    """
    Experiment Management. You can get info about experiments with this command, such as
    listing, and removing them if no longer necessary
    """
    # NOTE: the docstring above is shown by click as the group's help text.
    # This callback only makes sure the experiments folder is available;
    # failures are reported on stderr but do not abort (best effort).
    path = Path(settings.experiments_dir)

    if not path.exists():
        click.echo(f"[INFO] Creating experiments folder in: {path}")
        try:
            # parents=True so a nested experiments dir can be created even if
            # its ancestors are missing; exist_ok=True tolerates the folder
            # showing up between the check above and this call.
            path.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            click.echo(
                f"[ERROR] Could not create folder due to the following error: {e}",
                err=True,
            )
    elif not path.is_dir():
        # Something that is not a directory already occupies the path.
        click.echo(
            f"[ERROR] Could not create folder {path}. File already exists but is not a folder",
            err=True,
        )
449+
450+
451+
@experiment.command()
@click.argument("branch", nargs=1, required=False)
def ls(branch: str = None):
    """
    List branches if no argument is provided. If a branch name is specified, then list experiments within that branch.
    Examples:
        c4v experiment ls
            branch1
            branch2
            branch3
        c4v experiment ls branch1
            experiment1
            experiment2
    """
    # TODO maybe move this logic to ExperimentFSManager?

    # No branch given: list the entries of the experiments folder itself.
    if not branch:
        click.echo(f"[INFO] Listing from {settings.experiments_dir}")
        files = CLIClient._ls_files(settings.experiments_dir)
        click.echo("\n".join(files))
        return

    # Check if branch exists
    path = Path(settings.experiments_dir, branch)
    if not path.exists():
        click.echo(
            f"[ERROR] This is not a valid branch name: {branch} in {path}. You can see available branches using the command:\n\tc4v experiment ls",
            err=True,
        )
        return
    elif not path.is_dir():
        # Report on stderr and stop: there is nothing to list inside a
        # non-directory, so falling through would print a misleading
        # "[INFO] Listing from ..." line followed by an empty listing.
        click.echo(
            f"[ERROR] Invalid Branch path: {path}. The branch name '{branch}' does not refers to an actual branch's directory",
            err=True,
        )
        return

    # As everything is ok, just list files
    click.echo(f"[INFO] Listing from {path}")
    files = CLIClient._ls_files(path=str(path))
    click.echo("\n".join(files))
489+
490+
491+
@experiment.command()
@click.argument("experiment", nargs=1)
def summary(experiment: str):
    """
    Print summary for an existent experiment given its name with branch and experiment name\n
    Example:\n
        `c4v experiment summary branch_name/experiment_name`
    """
    # NOTE: the docstring above is shown by click as the command's help text.

    # Parse branch and experiment name; the parser reports the problem itself
    # when the format is invalid, so there is nothing more to print here.
    branch_and_experiment = CLIClient.parse_branch_and_experiment_from(experiment)
    if not branch_and_experiment:
        return

    # As everything went ok, unpack branch name and experiment name
    branch_name, experiment_name = branch_and_experiment

    # The summary is read from "summary.txt" inside the experiment's folder.
    path = Path(settings.experiments_dir, branch_name, experiment_name, "summary.txt")

    # Check if file exists
    if not path.exists():
        click.echo(
            f"[ERROR] Summary for experiment {experiment} not found in {path}", err=True
        )
        return
    elif not path.is_file():
        click.echo(
            f"[ERROR] Summary for experiment {experiment} in {path} is not a valid file",
            err=True,
        )
        return

    # As everything went ok, print file content
    click.echo(f"[INFO] Reading summary from: {path}")
    click.echo(path.read_text())
525+
526+
390527
class CLIClient:
391528
"""
392529
This class will manage common operations performed by the CLI tool
@@ -398,7 +535,7 @@ def __init__(
398535

399536
# Default manager
400537
if not manager:
401-
manager = Manager.from_local_sqlite_db(DEFAULT_DB)
538+
manager = Manager.from_default(local_files_path=settings.c4v_folder)
402539

403540
self._manager = manager
404541

@@ -423,6 +560,7 @@ def get_data_for_urls(
423560
urls_to_retrieve = urls or self._urls
424561

425562
# Check scrapable urls:
563+
426564
scrapable_urls, non_scrapables = self._manager.split_non_scrapable(
427565
urls_to_retrieve
428566
)
@@ -493,7 +631,7 @@ def parse_branch_and_experiment_from(line: str) -> Tuple[str, str]:
493631
return (branch, name)
494632

495633
click.echo(
496-
f"[ERROR] Given experiment name is not valid: {line}. Should be in the form:",
634+
f"[ERROR] Given experiment name is not valid: {line}. Should be of the form:",
497635
err=True,
498636
)
499637
for separator in separators:
@@ -557,6 +695,13 @@ def _parse_lines_from_files(files: List[str]) -> List[str]:
557695
)
558696
return lines
559697

698+
@staticmethod
699+
def _ls_files(path: str) -> List[str]:
700+
"""
701+
Returns a list of file names within a given directory
702+
"""
703+
return [str(x.name) for x in Path(path).glob("*")]
704+
560705

561706
if __name__ == "__main__":
562707
c4v_cli()

0 commit comments

Comments
 (0)