lisbon-opendata · olafveerman · Oct 24, 2013 · Oct 25, 2013 · Oct 26, 2013 · Oct 26, 2013
diff --git a/README.md b/README.md
@@ -8,15 +8,13 @@ The Portuguese census data is published on the [INE website](http://censos.ine.p
 This script was initially built for the [Views on Lisbon](https://github.com/lisbon-opendata/views-on-lisbon) project, but then made more general to be able to use it with any census issue.
 
 ## Limitations
-The first version only processes totals.
+The first version only processes the totals of each indicator. Any sub-totals included in columns are not taken into account for the time being.
 
 ## Usage
 The script needs one argument to work: the question ID.
 
-```bash xxx.sh _$id_```
+```bash xxx.sh -i [3-digit number] -o [output file-name]```
 
 for example:
 
-```bash xxx.sh 605```
-
-## Requirements
+```bash xxx.sh -i 605 -o data-605.csv```
diff --git a/data/nationalities_transpose.py b/data/nationalities_transpose.py
diff --git a/generate_header.py b/generate_header.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+# This script takes 
+
+import csv
+import sys
+
+# Open the CSV file
+f = open(sys.argv[1], 'a')
+
+with open(sys.argv[2], 'rb') as ifile:
+    reader = csv.reader(ifile)
+    rows = list(reader)
+    aaid = int(rows[0][2])
+
+    # Determine what structure we're dealing with. Possibilites: 
+    # 1. One sub-category (100)
+    # 2. Two sub-categories (1)
+    # 3. No sub-category (anything else)
+
+    compare_id = int(rows[1][0]) - int(rows[0][0])
+
+    # First column is always admin_area_id
+    f.write('admin_area_id')
+
+    first_cat = True
+
+	# 1. One sub-category
+    if compare_id == 100:
+    	for row in rows:
+    		# We only need the structure of one administrative area
+    		if int(row[2]) != aaid:
+    			break
+    		# On the first row, the category name will always be 'Total'
+    		if first_cat:
+    			f.write(', total')
+    			first_cat = False
+    		else:
+    			# Write each category to a column
+    			f.write(', "' + row[3].strip() + '"')
+
+    # 2. Two sub-categories
+    elif compare_id == 1:
+
+    	for row in rows:
+
+    		# We only need the structure of one administrative area
+    		if int(row[2]) != aaid:
+    			break
+    		# Last number of 'ordem' indicates which level we're dealing with
+    		last_no = (row[0])[-1:]
+
+    		# On the first two rows, the category name will always be 'Total'
+    		if first_cat:
+    			category_name = 'total'
+    			first_cat = False
+    		# Otherwise, if the 'ordem' finishes with one, we're dealing 
+    		# with a top-level category and need to store it.
+    		elif last_no == "1":
+    			category_name = row[3].rsplit(' ', 1)[0].strip()
+
+    		# The sub-category is the last word of the string
+    		sub_cat = (row[3].rsplit(' ', 1)[1])
+
+    		f.write(', "' + category_name + ' - ' + sub_cat + '"')
+
+	# 3. No sub-category
+    else:
+    	# In this case we just print a header 'Total'
+    	f.write(', Total')
+
+    # Add a line ending
+    f.write('\n')
+
+# Done. Close the file.
+f.close
diff --git a/data/process_nationalities.sh → process_census.sh b/data/process_nationalities.sh → process_census.sh
@@ -1,12 +1,14 @@
 #!/bin/bash --posix
 
-# Script for the Views on Lisbon project, processing
+# Script to process data from the Portuguese 2011 Census.
 
 # INSTRUCTIONS
-# $ bash process_nationalities.sh
+# $ bash process_census.sh -i [indicator] -o [output file]
+# Example: bash process_census.sh -i 204 -o data.csv
 
 # OUTPUT
-# A de-normalized CSV file with data about nationalities per freguesia in Portugal
+# A de-normalized CSV file with totals of the indicator per administrative
+# area.
 
 # TODO
 # Improve the source_zip + souce_file vars (not hard-coded)
@@ -25,21 +27,21 @@ error()
 usage()
 {
 	cat >&2 <<-EOF
-		Usage : $0: -u http://www.ine.pt/investigadores/Quadros/Q605.zip -o nationalities_portugal.csv
-			-u source URL
+		Usage : $0 -i 605 -o nationalities_portugal.csv
+			-i indicator
 			-o output file
 			-h help
 	EOF
 }
 
-typeset var_url=""
+typeset var_indicator=""
 typeset var_output=""
 
-while getopts "u:o:h" option
+while getopts "i:o:h" option
 do
 	case $option in
-	u)
-		var_url="$OPTARG"
+	i)
+		var_indicator="$OPTARG"
 		;;
 	o)
 		var_output="$OPTARG"
@@ -57,22 +59,28 @@ done
 
 # check that args not empty
 # //TEMP we could make better checks
-[[ $var_url != "" ]] || { usage; exit 1;}
+[[ $var_indicator != "" ]] || { usage; exit 1;}
 [[ $var_output != "" ]] || { usage; exit 1;}
 
-# check that nationalities_transpose.py is available
-typeset -r cmd_transpose_nationalities="nationalities_transpose.py"
-[[ -f $cmd_transpose_nationalities ]] || error "This script needs $cmd_transpose_nationalities"
+# check that transpose_table.py is available
+typeset -r cmd_transpose_table="transpose_table.py"
+[[ -f $cmd_transpose_table ]] || error "This script needs $cmd_transpose_table"
+
+# check that generate_header.py is available
+typeset -r cmd_generate_header="generate_header.py"
+[[ -f $cmd_generate_header ]] || error "This script needs $cmd_generate_header"
 
 # The folder should not contain the final file already
 [[ ! -f $var_output ]] || error "It seems you already have a $var_output in this folder. Remove it and run this script again."
 
+# constructing the URL based on the indicator
+typeset -r base_url=http://www.ine.pt/investigadores/Quadros/Q${var_indicator}.zip
 # getting base file name from URL (//TEMP see if it is not better to put it on argument)
-typeset -r base_zip_file_name=${var_url##*/}
+typeset -r base_zip_file_name=Q${var_indicator}.zip
 typeset -r base_file_name=${base_zip_file_name%%.*}
 
 #The sheets in the Excel that need to be processed
-typeset -r sheets=(Q605_NORTE Q605_CENTRO Q605_LISBOA Q605_ALENTEJO Q605_ALGARVE Q605_ACORES Q605_MADEIRA)
+typeset -r sheets=(Q${var_indicator}_NORTE Q${var_indicator}_CENTRO Q${var_indicator}_LISBOA Q${var_indicator}_ALENTEJO Q${var_indicator}_ALGARVE Q${var_indicator}_ACORES Q${var_indicator}_MADEIRA)
 
 #Change Internal Field Separator to new line. Otherwise, it will think spaces in filenames are field separators
 typeset -r IFS=$'\n'
@@ -109,7 +117,7 @@ done
 download_and_unzip()
 {
 	echo "Downloading and unzipping the file..."
-	$cmd_wget "$var_url" || error "$cmd_wget "$var_url""
+	$cmd_wget "$base_url" || error "$cmd_wget "$base_url""
 	$cmd_unzip -q $base_file_name.zip || error "$cmd_unzip -q $base_file_name.zip"
 	# //TEMP see if not better to put a clean option
 	# (if for test we do not want to download everytime the archive)
@@ -161,24 +169,30 @@ for sheet in ${sheets[*]}
 do
 	#csvcut removes the columns with age-specific data. We only need the totals per administrative area
 	#csvgrep removes all the rows that are not related to a freguesia (Identified by a 6 in column 2)
-	$cmd_csvcut -c 2,3,4,5 $sheet.csv | csvgrep -c 1 -m "6" > $sheet-tmp.csv
+	$cmd_csvcut -c 1,2,3,4,5 $sheet.csv | csvgrep -c 2 -m "6" > $sheet-tmp.csv
 	#Remove first line that's empty
 	$cmd_sed -i "1,1d" $sheet-tmp.csv
 	#Do some housekeeping by removing the tmp files.
 	rm $sheet.csv || error "rm $sheet.csv"
 	mv $sheet-tmp.csv $sheet.csv || error "mv $sheet-tmp.csv $sheet.csv"
 done
 
+#Create the file with the final data
+touch $var_output || error "touch $var_output"
+
+elapsed_time=$(($SECONDS - $start_time))
+echo "$elapsed_time seconds. Building the header of the CSV..."
+
+#Build the header of the CSV based on the first sheet in the sheets array
+$cmd_python $cmd_generate_header $var_output $sheets.csv || error "$cmd_python $cmd_generate_header $var_output $sheets.csv"
+
 elapsed_time=$(($SECONDS - $start_time))
 echo "$elapsed_time seconds. About to transpose the data and add it to the final table..."
 
-#Create the file with the final data
-touch $var_output || error "touch $var_output"
-echo 'id,"Total HM", "Total H", "Portugal HM", "Portugal H", "Estrangeira HM", "Estrangeira H", "Europa HM", "Europa H", "União Europeia 27 (S/PT) HM", "União Europeia 27 (S/PT) H", "França HM", "França H", "Países Baixos (Holanda) HM", "Países Baixos (Holanda) H", "Alemanha HM", "Alemanha H", "Itália HM", "Itália H", "Reino Unido HM", "Reino Unido H", "Irlanda HM", "Irlanda H", "Dinamarca HM", "Dinamarca H", "Grécia HM", "Grécia H", "Espanha HM", "Espanha H", "Bélgica HM", "Bélgica H", "Luxemburgo HM", "Luxemburgo H", "Suécia HM", "Suécia H", "Finlândia HM", "Finlândia H", "Áustria HM", "Áustria H", "Malta HM", "Malta H", "Estónia HM", "Estónia H", "Letónia HM", "Letónia H", "Lituânia HM", "Lituânia H", "Polónia HM", "Polónia H", "República Checa HM", "República Checa H", "Eslováquia HM", "Eslováquia H", "Hungria HM", "Hungria H", "Roménia HM", "Roménia H", "Bulgária HM", "Bulgária H", "Eslovénia HM", "Eslovénia H", "Chipre HM", "Chipre H", "Outros países (parcial) HM", "Outros países (parcial) H", "Noruega HM", "Noruega H", "Suíça HM", "Suíça H", "Rússia (Federação da) HM", "Rússia (Federação da) H", "Outros países - Europa HM", "Outros países - Europa H", "África HM", "África H", "África do Sul HM", "África do Sul H", "Angola HM", "Angola H", "Cabo Verde HM", "Cabo Verde H", "Guiné-Bissau HM", "Guiné-Bissau H", "Moçambique HM", "Moçambique H", "São Tomé e Príncipe HM", "São Tomé e Príncipe H", "Outros países - África HM", "Outros países - África H", "América HM", "América H", "Argentina HM", "Argentina H", "Brasil HM", "Brasil H", "Canadá HM", "Canadá H", "Estados Unidos da América HM", "Estados Unidos da América H", "Venezuela, República Bolivariana da HM", "Venezuela, República Bolivariana da H", "Outros país - América HM", "Outros país - América H", "Ásia HM", "Ásia H", "China HM", "China H", "Índia HM", "Índia H", "Japão HM", "Japão H", "Macau HM", "Macau H", "Paquistão HM", "Paquistão H", "Timor Leste HM", "Timor Leste H", "Outros países - Ásia HM", 
"Outros países - Ásia H", "Oceânia HM", "Oceânia H", "Austrália HM", "Austrália H", "Outros países da Oceânia HM", "Outros países da Oceânia H", "Outros países HM", "Outros países H", "Dupla nacionalidade HM", "Dupla nacionalidade H", "Dupla nacionalidade portuguesa e outra HM", "Dupla nacionalidade portuguesa e outra H", "Dupla nacionalidade estrangeira HM", "Dupla nacionalidade estrangeira H", "Dupla nacionalidade estrangeira, sendo uma da União Europeia HM", "Dupla nacionalidade estrangeira, sendo uma da União Europeia H", "Dupla nacionalidade estrangeira, nenhuma da União Europeia HM", "Dupla nacionalidade estrangeira, nenhuma da União Europeia H", "Apátrida HM", "Apátrida H"' > $var_output
 for sheet in ${sheets[*]}
 do
 	#For every sheet, a python script is called that transposes the data and adds it to the final file
-	$cmd_python $cmd_transpose_nationalities $var_output $sheet.csv || error "$cmd_python $cmd_transpose_nationalities $var_output $sheet.csv"
+	$cmd_python $cmd_transpose_table $var_output $sheet.csv || error "$cmd_python $cmd_transpose_table $var_output $sheet.csv"
 	rm $sheet.csv || error "rm $sheet.csv"
 done
 

diff --git a/transpose_table.py b/transpose_table.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+import csv
+import sys
+
+# Open the CSV file
+f = open(sys.argv[1], 'a')
+
+with open(sys.argv[2], 'rb') as ifile:
+    reader = csv.reader(ifile)
+    admin_area_id = 0
+    for row in reader:
+
+    	# If the ID of the administrative area is different from the previous
+        # we're dealing with a new area
+        if (admin_area_id) != int(row[2]):
+            # Add the ID and the first bit of data
+            f.write('\n' + row[2] + ',' + row[4])
+
+            admin_area_id = int(row[2])
+        # Else, we're dealing with the same administrative area
+        else:
+            # we write the data to the same line
+            f.write(',' + row[4])
+
+# Done. Close the file.
+f.close()