Use rasterio and fiona libraries instead of gdal and ogr libraries (#67)

Dan-Eli · mpelchat04 · commit 1a1476e8ae82 · 2019-03-26T09:15:19.000-04:00
* Start Rasterio migration

* Remove GDAL for rasterio and fiona

* Replace all the calls from GDAL and OGR to fiona and rasterio

* Remove some dead code and clean up the code

* Delete unused files

* Adjust the travis tool

* Adjust the write array

* Adjust the README and use uint8 instead of float32 in rasterize

* Update images_to_samples.py

* Update inference.py
diff --git a/.travis.yml b/.travis.yml
@@ -10,7 +10,7 @@ install:
   - conda update -q conda
   - conda info -a
 
-  - conda create -q -n ci_env python=3.6 pytorch-cpu=0.4.0 torchvision ruamel_yaml h5py gdal scikit-image scikit-learn -c pytorch
+  - conda create -q -n ci_env python=3.6 pytorch-cpu=0.4.0 torchvision ruamel_yaml h5py scikit-image scikit-learn fiona rasterio -c pytorch
   - source activate ci_env
 before_script:
   - unzip ./data/massachusetts_buildings.zip -d ./data
diff --git a/README.md b/README.md
@@ -16,7 +16,8 @@ After installing the required computing environment (see next section), one need
     - pytorch 0.4.1
     - torchvision 0.2.1
     - numpy
-    - gdal
+    - rasterio
+    - fiona
     - ruamel_yaml
     - scikit-image
     - scikit-learn
@@ -28,12 +29,12 @@ After installing the required computing environment (see next section), one need
 1. Using conda, you can set and activate your python environment with the following commands:  
     With GPU:
     ```shell
-    conda create -p YOUR_PATH python=3.6 pytorch=0.4.0 torchvision cuda80 ruamel_yaml h5py gdal=2.2.2 scikit-image scikit-learn=0.20 -c pytorch
+    conda create -p YOUR_PATH python=3.6 pytorch=0.4.0 torchvision cuda80 ruamel_yaml h5py fiona rasterio scikit-image scikit-learn=0.20 -c pytorch
     source activate YOUR_ENV
     ```
     CPU only:
     ```shell
-    conda create -p YOUR_PATH python=3.6 pytorch-cpu=0.4.0 torchvision ruamel_yaml h5py gdal=2.2.2 scikit-image scikit-learn=0.20 -c pytorch
+    conda create -p YOUR_PATH python=3.6 pytorch-cpu=0.4.0 torchvision ruamel_yaml h5py fiona rasterio scikit-image scikit-learn=0.20 -c pytorch
     source activate YOUR_ENV
     ```
 1. Set your parameters in the `config.yaml` (see section bellow)
diff --git a/images_to_samples.py b/images_to_samples.py
@@ -1,11 +1,13 @@
 import argparse
-import csv
 import os
 import numpy as np
 import h5py
 import warnings
-from osgeo import gdal, osr, ogr
-from utils import read_parameters, create_new_raster_from_base, assert_band_number, image_reader_as_array, \
+import fiona
+import rasterio
+from rasterio import features
+
+from utils import read_parameters, assert_band_number, image_reader_as_array, \
     create_or_empty_folder, validate_num_classes, read_csv
 
 try:
@@ -53,12 +55,12 @@ def resize_datasets(hdf5_file):
     hdf5_file['map_img'].resize(new_size, axis=0)
 
 
-def samples_preparation(sat_img, ref_img, sample_size, dist_samples, samples_count, num_classes, samples_file, dataset,
-                        background_switch):
+def samples_preparation(in_img_array, label_array, sample_size, dist_samples, samples_count, num_classes, samples_file,
+                        dataset, background_switch):
     """Extract and write samples from input image and reference image
     Args:
-        sat_img: Path and name to the input image
-        ref_img: path and name to the reference image
+        in_img_array: numpy array of the input image
+        label_array: numpy array of the reference image
         sample_size: Size (in pixel) of the samples to create
         dist_samples: Distance (in pixel) between samples in both images
         samples_count: Current number of samples created (will be appended and return)
@@ -69,8 +71,6 @@ def samples_preparation(sat_img, ref_img, sample_size, dist_samples, samples_cou
     """
 
     # read input and reference images as array
-    in_img_array = image_reader_as_array(sat_img)
-    label_array = image_reader_as_array(ref_img)
 
     h, w, num_bands = in_img_array.shape
 
@@ -110,23 +110,38 @@ def samples_preparation(sat_img, ref_img, sample_size, dist_samples, samples_cou
     return samples_count, num_classes
 
 
-def vector_to_raster(vector_file, attribute_name, new_raster):
+def vector_to_raster(vector_file, input_image, attribute_name):
     """Function to rasterize vector data.
     Args:
         vector_file: Path and name of reference GeoPackage
+        input_image: Path and name of the input raster image
         attribute_name: Attribute containing the pixel value to write
-        new_raster: Raster file where the info will be written
+
+    Return:
+        numpy array of the burned image
     """
-    source_ds = ogr.Open(vector_file)
-    source_layer = source_ds.GetLayer()
-    name_lyr = source_layer.GetLayerDefn().GetName()
-    rev_lyr = source_ds.ExecuteSQL("SELECT * FROM " + name_lyr + " ORDER BY " + attribute_name + " ASC")
 
-    gdal.RasterizeLayer(new_raster, [1], rev_lyr, options=["ATTRIBUTE=%s" % attribute_name])
+    # Extract vector features to burn in the raster image
+    with fiona.open(vector_file, 'r') as src:
+        lst_vector = [vector for vector in src]
+
+    # Sort features to prioritize the burning order in the raster image (ex: vegetation before roads...)
+    lst_vector.sort(key=lambda vector : vector['properties'][attribute_name])
+    lst_vector_tuple = [(vector['geometry'], int(vector['properties'][attribute_name])) for vector in lst_vector]
+
+    # Open input raster image to have access to number of rows, column, crs...
+    with rasterio.open(input_image, 'r') as src:
+        burned_raster = rasterio.features.rasterize( (vector_tuple for vector_tuple in lst_vector_tuple),
+                                    fill = 0,
+                                    out_shape=src.shape,
+                                    transform=src.transform,
+                                    dtype=np.uint8)
 
+    return burned_raster
 
-def main(bucket_name, data_path, samples_size, num_classes, number_of_bands, csv_file, samples_dist,
-         remove_background, mask_input_image, mask_reference):
+
+def main( bucket_name, data_path, samples_size, num_classes, number_of_bands, csv_file, samples_dist,
+          remove_background, mask_input_image, mask_reference):
     gpkg_file = []
     if bucket_name:
         s3 = boto3.resource('s3')
@@ -135,10 +150,8 @@ def main(bucket_name, data_path, samples_size, num_classes, number_of_bands, csv
         list_data_prep = read_csv('samples_prep.csv')
         if data_path:
             final_samples_folder = os.path.join(data_path, "samples")
-            final_out_label_folder = os.path.join(data_path, "label")
         else:
             final_samples_folder = "samples"
-            final_out_label_folder = "label"
         samples_folder = "samples"
         out_label_folder = "label"
 
@@ -157,17 +170,14 @@ def main(bucket_name, data_path, samples_size, num_classes, number_of_bands, csv
     val_hdf5 = h5py.File(os.path.join(samples_folder, "val_samples.hdf5"), "w")
 
     trn_hdf5.create_dataset("sat_img", (0, samples_size, samples_size, number_of_bands), np.float32,
-                                maxshape=(None, samples_size, samples_size, number_of_bands))
+                            maxshape=(None, samples_size, samples_size, number_of_bands))
     trn_hdf5.create_dataset("map_img", (0, samples_size, samples_size), np.uint8,
-                                maxshape=(None, samples_size, samples_size))
+                            maxshape=(None, samples_size, samples_size))
     val_hdf5.create_dataset("sat_img", (0, samples_size, samples_size, number_of_bands), np.float32,
-                                maxshape=(None, samples_size, samples_size, number_of_bands))
+                            maxshape=(None, samples_size, samples_size, number_of_bands))
     val_hdf5.create_dataset("map_img", (0, samples_size, samples_size), np.uint8,
-                                maxshape=(None, samples_size, samples_size))
+                            maxshape=(None, samples_size, samples_size))
     for info in list_data_prep:
-        img_name = os.path.basename(info['tif']).split('.')[0]
-        tmp_label_name = os.path.join(out_label_folder, img_name + "_label_tmp.tif")
-        label_name = os.path.join(out_label_folder, img_name + "_label.tif")
 
         if bucket_name:
             bucket.download_file(info['tif'], "Images/" + info['tif'].split('/')[-1])
@@ -178,38 +188,33 @@ def main(bucket_name, data_path, samples_size, num_classes, number_of_bands, csv
             info['gpkg'] = info['gpkg'].split('/')[-1]
         assert_band_number(info['tif'], number_of_bands)
 
-        value_field = info['attribute_name']
-        validate_num_classes(info['gpkg'], num_classes, value_field)
-
-        # Mask zeros from input image into label raster.
-        if mask_reference:
-            tmp_label_raster = create_new_raster_from_base(info['tif'], tmp_label_name, 1)
-            vector_to_raster(info['gpkg'], info['attribute_name'], tmp_label_raster)
-            tmp_label_raster = None
+        # Read the input raster image
+        np_input_image = image_reader_as_array(info['tif'])
 
-            masked_array = mask_image(image_reader_as_array(info['tif']), image_reader_as_array(tmp_label_name))
-            create_new_raster_from_base(info['tif'], label_name, 1, masked_array)
+        # Validate the number of class in the vector file
+        validate_num_classes(info['gpkg'], num_classes, info['attribute_name'])
 
-            os.remove(tmp_label_name)
+        # Burn vector file in a raster file
+        np_label_raster = vector_to_raster(info['gpkg'], info['tif'], info['attribute_name'])
 
-        else:
-            label_raster = create_new_raster_from_base(info['tif'], label_name, 1)
-            vector_to_raster(info['gpkg'], info['attribute_name'], label_raster)
-            label_raster = None
+        # Mask the zeros from input image into label raster.
+        if mask_reference:
+            np_label_raster = mask_image(np_input_image, np_label_raster)
 
-        # Mask zeros from label raster into input image.
+        # Mask zeros from label raster into input image otherwise use original image
         if mask_input_image:
-            masked_img = mask_image(image_reader_as_array(label_name), image_reader_as_array(info['tif']))
-            create_new_raster_from_base(label_name, info['tif'], number_of_bands, masked_img)
+            np_input_image = mask_image(np_label_raster, np_input_image)
 
         if info['dataset'] == 'trn':
             out_file = trn_hdf5
         elif info['dataset'] == 'val':
             out_file = val_hdf5
 
-        number_samples, number_classes = samples_preparation(info['tif'], label_name, samples_size, samples_dist,
+        np_label_raster = np.reshape(np_label_raster, (np_label_raster.shape[0], np_label_raster.shape[1], 1))
+        number_samples, number_classes = samples_preparation(np_input_image, np_label_raster, samples_size, samples_dist,
                                                              number_samples, number_classes, out_file, info['dataset'],
                                                              remove_background)
+
         print(info['tif'])
         print(number_samples)
         out_file.flush()
@@ -234,6 +239,10 @@ def main(bucket_name, data_path, samples_size, num_classes, number_of_bands, csv
     args = parser.parse_args()
     params = read_parameters(args.ParamFile)
 
+    import time
+    start_time = time.time()
+
+
     main(params['global']['bucket_name'],
          params['global']['data_path'],
          params['global']['samples_size'],
@@ -244,3 +253,6 @@ def main(bucket_name, data_path, samples_size, num_classes, number_of_bands, csv
          params['sample']['remove_background'],
          params['sample']['mask_input_image'],
          params['sample']['mask_reference'])
+
+    print ("Elapsed time:{}".format(time.time() - start_time))
+
diff --git a/inference.py b/inference.py
@@ -6,10 +6,11 @@
 import time
 import argparse
 import heapq
+import rasterio
 from PIL import Image
 import torchvision
 from models.model_choice import net, maxpool_level
-from utils import read_parameters, create_new_raster_from_base, assert_band_number, load_from_checkpoint, \
+from utils import read_parameters, assert_band_number, load_from_checkpoint, \
     image_reader_as_array, read_csv
 
 try:
@@ -83,7 +84,7 @@ def main(bucket, work_folder, img_list, weights_file_name, model, number_of_band
             print()
         else:
             sem_seg_results = sem_seg_inference(bucket, model, img['tif'], overlay)
-            create_new_raster_from_base(local_img, inference_image, 1, sem_seg_results)
+            create_new_raster_from_base(local_img, inference_image, sem_seg_results)
             print(f"Semantic segmentation of image {img_name} completed")
 
         if bucket:
@@ -102,6 +103,29 @@ def main(bucket, work_folder, img_list, weights_file_name, model, number_of_band
     print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
 
 
+def create_new_raster_from_base(input_raster, output_raster, write_array):
+    """Function to use info from input raster to create new one.
+    Args:
+        input_raster: input raster path and name
+        output_raster: raster name and path to be created with info from input
+        write_array: array (h, w, c) whose first channel is written as band 1 of the new raster
+
+    Return:
+        none
+    """
+
+    with rasterio.open(input_raster, 'r') as src:
+        with rasterio.open( output_raster, 'w',
+                            driver=src.driver,
+                            width=src.width,
+                            height=src.height,
+                            count=1,
+                            crs=src.crs,
+                            dtype=np.uint8,
+                            transform=src.transform) as dst:
+            dst.write(write_array[:,:,0], 1)
+
+
 def sem_seg_inference(bucket, model, image, overlay):
     """Inference on images using semantic segmentation
     Args:
diff --git a/utils.py b/utils.py
@@ -1,13 +1,11 @@
 import torch
 # import torch should be first. Unclear issue, mentionned here: https://github.com/pytorch/pytorch/issues/2083
 import os
-import subprocess
 import numpy as np
-# import matplotlib.pyplot as plt
-import gdal
+import rasterio
 import warnings
 from ruamel_yaml import YAML
-from osgeo import gdal, ogr
+import fiona
 import csv
 try:
     import boto3
@@ -43,35 +41,6 @@ def read_parameters(param_file):
     return params
 
 
-def create_new_raster_from_base(input_raster, output_raster, band_count, write_array=None):
-    """Function to use info from input raster to create new one.
-    Args:
-        input_raster: input raster path and name
-        output_raster: raster name and path to be created with info from input
-        band_count: number of bands in the input raster
-        write_array (optional): array to write into the new raster
-    """
-    input_image = gdal.Open(input_raster)
-    src = input_image
-    cols = src.RasterXSize
-    rows = src.RasterYSize
-    projection = src.GetProjection()
-    geotransform = src.GetGeoTransform()
-
-    new_raster = gdal.GetDriverByName('GTiff').Create(output_raster, cols, rows, band_count, gdal.GDT_Byte)
-    new_raster.SetProjection(projection)
-    new_raster.SetGeoTransform(geotransform)
-
-    for band_num in range(0, band_count):
-        band = new_raster.GetRasterBand(band_num + 1)
-        band.SetNoDataValue(-9999)
-        # Write array if provided. If not, the image is filled with NoDataValues
-        if write_array is not None:
-            band.WriteArray(write_array[:, :, band_num])
-            band.FlushCache()
-    return new_raster
-
-
 def assert_band_number(in_image, band_count_yaml):
     """Verify if provided image has the same number of bands as described in the .yaml
     Args:
@@ -117,36 +86,39 @@ def image_reader_as_array(file_name):
     """Read an image from a file and return a 3d array (h,w,c)
     Args:
         file_name: full file path of the image
-    """
-    raster = gdal.Open(file_name)
-    band_num = raster.RasterCount
-    band = raster.GetRasterBand(1)
-    rows, columns = (band.XSize, band.YSize)
 
-    np_array = np.empty([columns, rows, band_num], dtype=np.float32)
+    Return:
+        numpy array (h, w, c) of the image read
+    """
 
-    for i in range(0, band_num):
-        band = raster.GetRasterBand(i + 1)
-        arr = band.ReadAsArray()
-        np_array[:, :, i] = arr
+    with rasterio.open(file_name, 'r') as src:
+        np_array = np.empty([src.height, src.width, src.count], dtype=np.float32)
+        for i in range(src.count):
+            band = src.read(i+1)  # Bands starts at 1 in rasterio not 0
+            np_array[:, :, i] = band
 
     return np_array
 
 
 def validate_num_classes(vector_file, num_classes, value_field):
-    """Validate that the number of classes in the .shp corresponds to the expected number
+    """Validate that the number of classes in the vector file corresponds to the expected number
     Args:
         vector_file: full file path of the vector image
         num_classes: number of classes set in config.yaml
         value_field: name of the value field representing the required classes in the vector image file
+
+    Return:
+        None
     """
-    source_ds = ogr.Open(vector_file)
-    source_layer = source_ds.GetLayer()
-    name_lyr = source_layer.GetLayerDefn().GetName()
-    vector_classes = source_ds.ExecuteSQL("SELECT DISTINCT " + value_field + " FROM " + name_lyr).GetFeatureCount()
-    if vector_classes + 1 != num_classes:
-        raise ValueError('The number of classes in the yaml.config (%d) is different than the number of classes in '
-                         'the file %s (%d)' % (num_classes, vector_file, vector_classes))
+
+    distinct_att = set()
+    with fiona.open(vector_file, 'r') as src:
+        for feature in src:
+            distinct_att.add(feature['properties'][value_field])  # Use property of set to store unique values
+
+    if len(distinct_att)+1 != num_classes:
+        raise ValueError('The number of classes in the yaml.config {} is different than the number of classes in '
+                         'the file {} {}'.format (num_classes, vector_file, str(list(distinct_att))))
 
 
 def list_s3_subfolders(bucket, data_path):