Skip to content

Commit 1aa1c5b

Browse files
authored
Merge pull request #94 from code-for-venezuela/luis/docs
Luis/docs
2 parents 7c7c676 + d322012 commit 1aa1c5b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+1956
-542
lines changed

.github/workflows/ci/logger.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""Shared logger for the documentation CI scripts.

Exposes a module-level ``logger`` configured to emit DEBUG-and-above
records to the console with a timestamped format.
"""

import logging

# Module-wide logger used by all doc CI scripts.
logger = logging.getLogger("microscope_doc_ci_logging")
logger.setLevel(logging.DEBUG)

# Console handler: everything DEBUG and above goes to the stream handler's
# default destination (stderr), formatted with a timestamp, logger name
# and severity.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(ch)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""CI gate: fail the build when ``mkdocs build`` reported warnings.

Usage: python <script> <mkdocs-build-exit-status>
The workflow passes the exit status of ``mkdocs build``; any value other
than "0" means warnings/errors were found and this script must fail the
CI job by raising.
"""

import sys

from logger import logger

# The workflow passes mkdocs' exit status as the first CLI argument
# (a string, so compare against the literal "0" rather than str(0)).
mkdocs_build_output = sys.argv[1]
if mkdocs_build_output != "0":
    logger.error('Error building mkdocs. Warnings were found.')
    # Raising exits the interpreter with a non-zero status, failing the job.
    raise Exception('Error building mkdocs. Warnings were found.')
else:
    logger.info('No warnings found building mkdocs.')
10+

.github/workflows/ci/stage.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
'''
This script provides functionality for staging tags in documents.
It removes document sections enclosed between lines equal to the
"## stage ##" tag (see STAGE_TAG below).
This is intended to run before the building process of the documentation.
'''

import os
from typing import List

from logger import logger

# Only process files and folders within these documentation roots.
FOLDERS = ('docs_en', 'docs_es')
# Extensions eligible for sanitizing. Kept as a real tuple (note the
# trailing comma; '(".md")' would be a plain string) so str.endswith()
# can match any of several extensions if more are added later.
ACTIVE_FILE_EXT = ('.md',)
# Root of the documentation tree, relative to the repository checkout.
START_PATH = './docs/docs/'
# A staging section is delimited by a pair of lines equal to this tag.
STAGE_TAG = '## stage ##\n'
16+
17+
def get_stage_tags_positions(list_of_elems, element):
    '''Return the indexes of all occurrences of *element* in *list_of_elems*.

    Positions are returned in ascending order; an empty list means the
    element does not occur at all.
    '''
    # enumerate() replaces the original index()/try-except scan loop and
    # yields the same ascending positions.
    return [index for index, item in enumerate(list_of_elems) if item == element]
32+
33+
def sanitize_text(stage_tag_positions: List[int], lines: List[str]) -> List[str]:
    '''Remove every section enclosed in ## stage ## tags from *lines*.

    *stage_tag_positions* are the indexes of the tag lines, in ascending
    order; consecutive positions are paired as (open, close) and the
    enclosed range — tag lines included — is deleted. An unpaired trailing
    tag is ignored. Mutates *lines* in place and also returns it.
    '''
    pairs = list(zip(stage_tag_positions[::2], stage_tag_positions[1::2]))
    # Delete from the LAST pair to the first: removing a slice shifts every
    # later index, so deleting in ascending order (as the original code did)
    # makes all subsequent positions point at the wrong lines.
    for start, end in reversed(pairs):
        del lines[start:end + 1]
    logger.info(f'Total sanitized entries {len(pairs)}')
    return lines
42+
43+
def sanitize_file(root, file_name: str) -> None:
    '''Remove staged sections from one markdown file, rewriting it in place.

    Reads the file, finds the ## stage ## tag line positions, and writes
    the file back without the tagged sections. Leaves the file untouched
    when no tags are present.
    '''
    file = os.path.join(root, file_name)
    # 'with' guarantees the handle is closed even if readlines() raises
    # (the original open()/close() pairs leaked on exception).
    with open(file, 'r') as f:
        lines = f.readlines()
    stage_tags_linear = get_stage_tags_positions(lines, STAGE_TAG)
    logger.info(f'Sanitizing file {file}...')
    if len(stage_tags_linear) == 0:
        logger.info('No staging sections to remove')
        return
    new_text = sanitize_text(stage_tags_linear, lines)
    with open(file, 'w') as f:
        f.writelines(new_text)
57+
58+
def is_markdown_file(filename):
    '''Return True when *filename* carries an extension in ACTIVE_FILE_EXT.'''
    matches_active_extension = filename.endswith(ACTIVE_FILE_EXT)
    return matches_active_extension
60+
61+
def process_files(top_tuple):
    '''Sanitize every markdown file reachable from the given walk tuples.

    *top_tuple* is an iterable of (dirpath, dirnames, filenames) triples as
    produced by os.walk. Markdown files in each dirpath are sanitized
    first, then each listed subdirectory is walked and processed
    recursively.

    NOTE(review): when *top_tuple* comes straight from os.walk, the walk
    itself already yields nested directories, so the explicit recursion
    revisits them; the second pass finds no tags and is a no-op — confirm
    before restructuring.
    '''
    for dirpath, dirnames, filenames in top_tuple:
        # Files in this directory first.
        for name in filenames:
            if is_markdown_file(name):
                sanitize_file(dirpath, name)
        # Then descend into each subdirectory.
        for child in dirnames:
            process_files(os.walk(os.path.join(dirpath, child)))
71+
72+
# Entry point: walk the docs tree, keep only the walk entries whose path
# ends with one of the configured root folders, and sanitize every markdown
# file beneath them.
root_folders_walk = [
    walk_entry
    for walk_entry in os.walk(START_PATH)
    if walk_entry[0].endswith(FOLDERS)
]
process_files(root_folders_walk)

.github/workflows/publish_docs.yml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# This workflow updates the online documentation so that it matches the one
# we have written in mkdocs. It is triggered by pushes to master and
# publishes the generated site by pushing it to the configured branch.

name: c4v-py-docs

on:
  push:
    branches:
      - master

jobs:
  deploy:
    # NOTE(review): ubuntu-18.04 runners have been retired by GitHub; this
    # job will need a newer image (e.g. ubuntu-latest) — confirm and bump.
    runs-on: ubuntu-18.04
    steps:
      - uses: actions/checkout@v2

      - name: Setup Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.8'
          architecture: 'x64'

      - name: Cache dependencies
        uses: actions/cache@v1
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install -r ./requirements.txt

      # Strip ## stage ## sections before building so staged-only content
      # never reaches the published site.
      - name: Run staging sanitizer
        run: python ./.github/workflows/ci/stage.py

      - run: |
          pushd docs
          mkdocs build
          popd
          # cp ./CNAME ./docs/site/CNAME
          # cp ./.nojekyll ./docs/site/.nojekyll

      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
          personal_token: ${{ secrets.DOCS_ACCESS_TOKEN }}
          publish_dir: ./docs/site
          publish_branch: docs
          commit_message: "Update from commit: ${{ github.sha }} - ${{ github.event.head_commit.message }}"
          allow_empty_commit: false
          user_name: devops-c4v
          user_email: [email protected]
56+

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
> Solving Venezuela's pressing matters one commit at a time
88
99
`c4v-py` is a library used to address Venezuela's pressing issues
10-
using computer and data science.
10+
using computer and data science. Check the [online documentation](https://code-for-venezuela.github.io/c4v-py/)
1111

1212
- [Installation](#installation)
1313
- [Development](#development)
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Architecture & components
2+
3+
The Microscope library is composed of components that can be summarized as:
4+
5+
* **Scraper**: Will scrape data from **known** urls for specific websites, **not every website might be scrapable**,
6+
returning `ScrapedData` instances — this is the schema for the data expected from a page
7+
* **Crawler**: Will crawl new urls from specific sources, this data should be fed to the scraper at some point
8+
* **Persistency Manager**: Will store data scraped by the scraper in some persistent storage, an SQLite-based
9+
manager is provided by default
10+
* **Classifier**: Classifies a `ScrapedData` instance telling if it is a public service problem or not.
11+
* **Experiment**: This class controls an experiment run, it's useful to manage logging and results for experiments. Also,
12+
it makes it possible for every experiment to be run in more or less the same way, making it easier to use for newcomers.
13+
* **ExperimentFSManager**: Simple class controlling how experiments' filesystems are stored, enabling a unified filesystem
14+
for every experiment. You can implement a new object with the same interface if you want to provide an alternative method
15+
for experiment storage
16+
17+
<p align="center">
18+
<img src= "../../img/microscope_architecture.png">
19+
</p>
20+
21+
!!! Warning
22+
The **classifier** should be more specific in the future, it should be able not only to differentiate between news talking
23+
about public services or not, but also the kind of problem itself
24+
25+
---
26+
# Scraper
27+
The **Scraper** component is just a **single function** that receives a list of urls to scrape and manages to select
28+
the right **scraper object** for such url (based on its domain) or **raise an error** if it's not able to find any **matching scraper**.
29+
30+
## Example usage
31+
The next examples will show you how to use the scraper to scrape a list of urls, handle a possible non-valid url
32+
and filter out urls that may not be scrapable.
33+
### Scraping multiple urls with the Manager object
34+
The easiest way to scrape is using the manager object as follows:
35+
```
36+
import c4v.microscope as ms
37+
38+
# Creates the default manager
39+
m = ms.Manager.from_default()
40+
41+
urls = [
42+
"https://primicia.com.ve/mas/servicios/siete-trucos-caseros-para-limpiar-la-plancha-de-ropa/",
43+
"https://primicia.com.ve/guayana/ciudad/suenan-con-urbanismo-en-core-8/"
44+
]
45+
46+
# Output may depend on your internet connection and page availability
47+
for result in m.scrape(urls):
48+
print(result.pretty_repr(max_content_len = 100))
49+
50+
```
51+
### Scraping a single url
52+
```
53+
import c4v.microscope as ms
54+
55+
m = ms.Manager.from_default()
56+
57+
url = "https://primicia.com.ve/mas/servicios/siete-trucos-caseros-para-limpiar-la-plancha-de-ropa/"
58+
59+
# Output may depend on your internet connection and page availability
60+
result = m.scrape(url)
61+
print(result.pretty_repr(max_content_len = 100))
62+
```
63+
64+
### Removing non-scrapable urls
65+
Here we can see how to separate scrapable urls from non-scrapable ones. It may be helpful to know which urls can be processed
66+
```
67+
import c4v.microscope as ms
68+
69+
m = ms.Manager.from_default()
70+
71+
urls = [
72+
"https://primicia.com.ve",
73+
"https://elpitazo.net",
74+
"https://supernotscrapable.com"
75+
]
76+
77+
assert m.split_non_scrapable(urls) == (urls[:2], urls[2:])
78+
```
79+
### TODO
80+
add more useful examples
81+
## Creation
82+
You can create a new scraper in order to support scraping for new sites. More details about this in ["creating a scraper"](./creating-a-scraper.md)
83+
# Crawler
84+
TODO
85+
## Creation
86+
You can create a new crawler in order to support exploring new urls for new sites. More details about this in ["creating a crawler"](./creating-a-crawler.md)
87+
# Persistency Manager
88+
TODO
89+
## Creation
90+
You can create a new `Persistency Manager` object in order to support new ways of storing data. More details about this in ["creating a persistency manager"](./creating-a-persistency-manager.md)
91+
# Experiment
92+
TODO
93+
# ExperimentFSManager
94+
TODO
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Creating a Crawler
2+
TODO

0 commit comments

Comments
 (0)