# Architecture & components

The Microscope library is composed of components that can be summarized as follows (a sketch of the end-to-end flow comes right after the architecture diagram below):

* **Scraper**: Scrapes data from **known** urls for specific websites (**not every website is scrapable**),
returning `ScrapedData` instances, the schema for the data expected from a page.
* **Crawler**: Crawls new urls from specific sources; this data should be fed to the scraper at some point.
* **Persistency Manager**: Stores data scraped by the scraper in some persistent storage; an SQLite-based
manager is provided by default.
* **Classifier**: Classifies a `ScrapedData` instance, telling whether it describes a public service problem or not.
* **Experiment**: This class controls an experiment run; it's useful for managing logging and results for experiments. It also
makes it possible for every experiment to be run in more or less the same way, making it easier for newcomers.
* **ExperimentFSManager**: A simple class controlling how experiment filesystems are stored, enabling a unified filesystem
layout for every experiment. You can implement an object with the same interface if you want to provide an alternative
storage method for experiments.

<p align="center">
  <img src="../../img/microscope_architecture.png">
</p>
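
To make the flow between these components concrete, here is a minimal sketch of the intended pipeline. Only `ms.Manager.from_default`, `m.scrape`, and `pretty_repr` are calls confirmed by the examples below; every other attribute and method name (`crawl_new_urls`, `persistency_manager.save`, `classifier.classify`) is **hypothetical** and used for illustration only:
```
import c4v.microscope as ms

# Create the default manager (confirmed API, see the scraper examples below)
m = ms.Manager.from_default()

# 1. Crawl: discover new urls from known sources.
#    HYPOTHETICAL name, for illustration only.
urls = m.crawl_new_urls()

# 2. Scrape: turn each known url into a ScrapedData instance (confirmed API)
for data in m.scrape(urls):
    # 3. Persist: store the scraped data.
    #    HYPOTHETICAL name, for illustration only.
    m.persistency_manager.save(data)

    # 4. Classify: tell whether the data describes a public service problem.
    #    HYPOTHETICAL name, for illustration only.
    print(m.classifier.classify(data))
```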

!!! Warning
    The **classifier** should become more specific in the future: it should be able not only to tell whether a news
    article talks about public services, but also to identify the kind of problem itself.

---
# Scraper
The **Scraper** component is just a **single function** that receives a list of urls to scrape and selects
the right **scraper object** for each url (based on its domain), or **raises an error** if it's not able to find any **matching scraper**.

## Example usage
The following examples show how to use the scraper to scrape a list of urls, handle a url with no matching scraper,
and filter out urls that may not be scrapable.
### Scraping multiple urls with the Manager object
The easiest way to scrape is using the manager object, as follows:
```
import c4v.microscope as ms

# Creates the default manager
m = ms.Manager.from_default()

urls = [
    "https://primicia.com.ve/mas/servicios/siete-trucos-caseros-para-limpiar-la-plancha-de-ropa/",
    "https://primicia.com.ve/guayana/ciudad/suenan-con-urbanismo-en-core-8/"
]

# Output may depend on your internet connection and page availability
for result in m.scrape(urls):
    print(result.pretty_repr(max_content_len = 100))
```
### Scraping a single url
```
import c4v.microscope as ms

m = ms.Manager.from_default()

url = "https://primicia.com.ve/mas/servicios/siete-trucos-caseros-para-limpiar-la-plancha-de-ropa/"

# Output may depend on your internet connection and page availability
result = m.scrape(url)
print(result.pretty_repr(max_content_len = 100))
```
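
### Handling a url with no matching scraper
As mentioned above, the scraper **raises an error** when it can't find a matching scraper for a url's domain. The exact
exception class is an assumption here (it isn't documented on this page), so this sketch catches a generic `Exception`:
```
import c4v.microscope as ms

m = ms.Manager.from_default()

# This domain has no matching scraper, so scraping it should fail
url = "https://supernotscrapable.com"

try:
    result = m.scrape(url)
    print(result.pretty_repr(max_content_len = 100))
except Exception as e:  # ASSUMPTION: the concrete exception class isn't documented here
    print(f"Could not scrape {url}: {e}")
```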

### Removing non-scrapable urls
Here we can see how to separate scrapable urls from non-scrapable ones. It can be helpful to know in advance which urls can be processed:
```
import c4v.microscope as ms

m = ms.Manager.from_default()

urls = [
    "https://primicia.com.ve",
    "https://elpitazo.net",
    "https://supernotscrapable.com"
]

assert m.split_non_scrapable(urls) == (urls[:2], urls[2:])
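Continuing from the snippet above, a natural pattern (a sketch using only the calls already shown) is to filter first and then scrape only the urls known to be scrapable:
```
scrapable, non_scrapable = m.split_non_scrapable(urls)

# Scrape only the urls we know how to handle
for result in m.scrape(scrapable):
    print(result.pretty_repr(max_content_len = 100))
```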
### TODO
Add more useful examples.
## Creation
You can create a new scraper in order to support scraping for new sites. More details can be found in ["creating a scraper"](./creating-a-scraper.md).
# Crawler
TODO
## Creation
You can create a new crawler in order to support exploring new urls for new sites. More details can be found in ["creating a crawler"](./creating-a-crawler.md).
# Persistency Manager
TODO
## Creation
You can create a new `Persistency Manager` object in order to support new ways of storing data. More details can be found in ["creating a persistency manager"](./creating-a-persistency-manager.md).
# Experiment
TODO
# ExperimentFSManager
TODO