IntelCompH2020
diff --git a/‎.DS_Store
-2 KB b/‎.DS_Store
-2 KB
diff --git a/‎README.md
Lines changed: 62 additions & 2 deletions b/‎README.md
Lines changed: 62 additions & 2 deletions
diff --git a/‎classes.dot
Lines changed: 60 additions & 0 deletions b/‎classes.dot
Lines changed: 60 additions & 0 deletions
diff --git a/‎classes.png
4.4 MB b/‎classes.png
4.4 MB
diff --git a/‎classes_tm2.png b/‎classes_tm2.png
diff --git a/‎lab_menu.yaml renamed to ‎config/lab_menu.yaml b/‎lab_menu.yaml renamed to ‎config/lab_menu.yaml
diff --git a/‎options_menu.yaml renamed to ‎config/options_menu.yaml
Lines changed: 36 additions & 17 deletions b/‎options_menu.yaml renamed to ‎config/options_menu.yaml
Lines changed: 36 additions & 17 deletions
diff --git a/‎parameters.default.yaml renamed to ‎config/parameters.default.yaml
Lines changed: 29 additions & 18 deletions b/‎parameters.default.yaml renamed to ‎config/parameters.default.yaml
Lines changed: 29 additions & 18 deletions
diff --git a/‎config/val_menu.yaml
Lines changed: 73 additions & 0 deletions b/‎config/val_menu.yaml
Lines changed: 73 additions & 0 deletions
@@ -1,2 +1,62 @@
-# GraphAnalysisToolbox
-A collection of tools for graph synthesis, processing and analysis
+# Supergraph
+
+<img src="https://github.com/Orieus/supergraph/blob/master/figures/supergraph.png" width="400">
+
+**Supergraph** is a generic software for the management and processing of an interrelated collection of multiple graphs.
+
+It can be used to process multiple graphs. Functionality includes (but is not limited to):
+
+1. **Similarity graphs**: generated from node attributes, based on different similarity measures (Jensen-Shannon, Hellinger, L1, L2).
+    * General implementations based on the `neighbors` module from [scikit-learn](https://scikit-learn.org/stable/).
+    * Specific implementation for fast computation of Hellinger distances using [Numba](https://numba.pydata.org/) and cuda.
+2. **Community detection** algorithms (Louvain, Walktrap, FastGreedy, Label Propagation)
+    * Implementations based on [IGraph](https://igraph.org/python/) and [Networkx](https://networkx.github.io/).
+3. **Bipartite** graphs from attributes
+4. **Transductive graphs**: Graphs generated by connecting target nodes from a bipartite graph. Link weights are computed from the links of a graph connecting the source nodes.
+5. **Transitive graphs**, computed as the composition of two bipartite graphs.
+6. **Analysis of graph partitions**.
+7. **Analysis of graph nodes** (centrality measures, PageRank).
+    * Implementations based on [Networkx](https://networkx.github.io/).
+8. **Editing tools** for the collection of graphs:
+    * Create, add, remove graphs
+    * Subsampling
+    * Reduction to graphs of equivalence classes
+9. **Tools for visualization**:
+    * Graph layout algorithms.
+    * Exportation to GEXF format
+    * Visualization of bipartite graphs (requires [Halo](https://vizuly.io/product/halo/), not included)
+        
+
+## Usage:
+
+### As an application:
+
+The software includes two applications that can be used to generate and manipulate graphs through an interactive menu:
+
+* `mainRDIgraphs.py`: Provides access to the software functionality through an interactive menu. It reads the links to the source data from a configuration file (`parameters.yaml`). You would need to edit this file to use other data.
+* `mainRDIlab.py`: It uses the software functionality to carry out experiments for analysing RDI corpus collections.
+
+Write
+
+    python mainRDIgraphs.py --h
+    python mainRDIlab.py --h
+
+to see the available options.
+   
+### As a software package:
+
+The software includes several class packages that can be used independently. Classes include (and are not limited to):
+
+   * `SimGraph`: Generation of similarity graphs
+   * `CommunityPlus`: Wrapper to community detection algorithms
+   * `DataGraph` (requires `SimGraph` and `CommunityPlus`): provides tools for graph processing and analysis.
+   * `SuperGraph` (requires `DataGraph`): provides tools for handling collections of DataGraph objects, including tools for the generation of new datagraphs.
+
+### Additional information
+
+You can find more detailed information about this software in the [Wiki](https://github.com/Orieus/supergraph/wiki).
+
+This project was initially conceived for the processing of multiple corpora of scientific publications, patents and project proposals, inside the project "**Service for Identifying Impact and R&D&I Agent Collaboration Networks**" (*Servicio para Identificar Impacto y Redes de Colaboración de Agentes I+D+i*), funded by the **Secretary of State for the Digital Agenda** (SEAD, Secretaría de Estado para la Agenda Digital), under the umbrella of the Spanish Plan for the Stimulus of Language Technologies (PTL, [*Plan de Impulso de las Tecnologías del Lenguaje*](https://www.plantl.gob.es/Paginas/index.aspx)).
+
+
+
@@ -64,9 +64,6 @@ load:
 setup: 
   title: Activate configuration file
 
-readData: 
-  title: Read dataset
-
 import_data:
   title: Import data
   options:
@@ -109,12 +106,14 @@ graph_tools:
     - largest_community_subgraph
     - remove_isolated_nodes
     - remove_snode_attributes
+    - disambiguate_node
 
 gInference: 
   title: Graph inference tools
   options:
     - equivalence_graph
     - infer_sim_graph
+    - import_and_infer_sim_graph
     - infer_eq_simgraph
     - infer_sim_bigraph
     - infer_ppr_graph
@@ -193,12 +192,12 @@ display_graphs:
 # Options for import_data
 
 import_snode_from_table:
-  title: Import nodes and edges from table files
+  title: Import nodes and features from table files
   options:
     - get_method: get_names_of_dataset_tables
 
 import_nodes_and_model:
-  title: Import nodes and feature_matrix into a zero-edge graph
+  title: Import nodes from table files and features from npz files
   options:
     - path: topicmodels
 
@@ -245,10 +244,7 @@ import_agents:
 showSDBdata:
   title: Show SQL data sources
   options:
-    - parameters:
-        Pu: Publications
-        Pr: Projects
-        Pa: Patents
+    - get_method: get_names_of_SQL_dbs
 
 manage_Neo4J:
   title: Manage Neo4J database
@@ -304,6 +300,9 @@ remove_snode_attributes:
     - path: graphs
     - get_method: get_attributes
 
+disambiguate_node:
+  title: Disambiguate node
+
 # ######################
 # Options for gInference
 
@@ -313,7 +312,24 @@ equivalence_graph:
     - path: topicmodels
 
 infer_sim_graph:
-  title: 'Similarity graph:                from A_X to A-A'
+  title: 'Similarity graph:                   from A_X to A-A'
+  options:
+    - path: graphs
+    - parameters:
+        He:  "He: 1 minus squared Hellinger's distance (JS) (sklearn-based)"
+        He2: 'He2: self implementation of He (faster)'
+        BC: 'BC: Bhattacharyya coefficient'
+        l1:  'l1: 1 minus l1 distance'
+        JS:  'JS: Jensen-Shannon similarity (too slow)'
+        Gauss: 'Gauss: An exponential function of the squared l2 distance'
+        He->JS: 'He->JS: JS through He and a theoretical bound'
+        He2->JS: 'He2->JS: Same as He->JS, but using implementation He2'
+        l1->JS: 'l1->JS: JS through l1 and a theoretical bound'
+        cosine: 'cosine: Cosine similarity'
+        ncosine: 'ncosine: Normalized cosine similarity (rescaled to [0, 1])' 
+
+import_and_infer_sim_graph:
+  title: 'Import and infer Similarity graph:  from A_X to A-A'
   options:
     - path: topicmodels
     - parameters:
@@ -327,7 +343,7 @@ infer_sim_graph:
         l1->JS: 'l1->JS: JS through l1 and a theoretical bound'
 
 infer_eq_simgraph:
-  title: 'Equivalent Similarity graph:     from A_X to eqA-eqA'
+  title: 'Equivalent Similarity graph:        from A_X to eqA-eqA'
   options:
     - path: topicmodels
     - parameters:
@@ -341,7 +357,7 @@ infer_eq_simgraph:
         l1->JS: 'l1->JS: JS through l1 and a theoretical bound'
 
 infer_sim_bigraph:
-  title: 'Similarity bipartite graph:      from A_X, B_X to A-B'
+  title: 'Similarity bipartite graph:         from A_X, B_X to A-B'
   options:
     - get_method: get_graphs_with_features
     - get_method: get_graphs_with_features
@@ -355,13 +371,13 @@ infer_ppr_graph:
     - path: graphs
 
 inferBGfromA:
-  title: 'Bipartite graph from attributes: from A_B to A->B'
+  title: 'Bipartite graph from attributes:    from A_B to A->B'
   options:
     - path: graphs
     - get_method: get_attributes
 
 transduce:
-  title: 'Transductive graph:              from A-A->B to B-B'
+  title: 'Transductive graph:                 from A-A->B to B-B'
   options:
     - path: bigraphs
     - parameters:
@@ -372,14 +388,14 @@ transduce:
 #   title: 'Similarity bipartite Graph:      from A_X, B_Y to A-B'
 
 inferTransit:
-  title: 'Transitive graph:                from A->B->C to A->C'
+  title: 'Transitive graph:                   from A->B->C to A->C'
   options:
     - path: bigraphs
     - path: bigraphs
 
 
-# ###########################
-# Optionas for display_graphs
+# ##########################
+# Options for display_graphs
 
 graph_layout: 
   title: Graph layout
@@ -401,6 +417,9 @@ show_top_nodes:
     - path: graphs
     - get_method: get_local_features
 
+profile_node:
+  title: Show profile of a given node
+
 # #############################################################################
 # LEVEL 3
 # #############################################################################
 
@@ -34,29 +34,39 @@ validate_all_models:
 # SQL and Graph DataBases
 connections:
   SQL:
-    # Select one and only one db for Pr, Pu and Pa.
+    # Select the databases to be used in the project.
     db_selection:
-      Pr: db_Pr_FECYT
-      # Pa: db_Pa_PATSTATS
-      # Pu: db_Pu_S24Ever    # publicacionesScopus
-      # Co: db_Crunch4Ever
+      # Each selection must have the form:
+      #   label: db_name
+      # where label is just a mnemonic used to identify the database, and
+      # db_name is the name of the database below. For instance, you can
+      # select a different database for projects, patents, publications and
+      # companies as
+      #    Pr: db_name01
+      #    Pa: db_name02
+      #    Pu: db_name03
+      #    Co: db_name04
+      # where db_name01, db_name02, ... must be keys of `databases` below.
     databases:
-      # name of the DB as specified when opening the connection
+      # Here, you can include a complete list of available databases.
+      # Only those included in db_selection (above) will be connected.
+      # The key of each DB is the name of the DB as specified when opening
+      # the connection. For instance:
+      # db_name01:
+      #  category: Pr                           # Type of database
+      #  connector: &sql_con mysql              # Use & to allow dereferencing
+      #  server: &sql_server hal01.tsc.uc3m.es  # Write your server address here
+      #  user: &sql_user username               # Write username here
+      #  password: &sql_password xxxxxxxx       # Write password here
       db_Pr_FECYT:
-        # the "&"s allow referring to ("dereferencing") the corresponding field below (using "*")
+        # the "&"s allow referring to ("dereferencing") the corresponding field
+        # below (using "*")
         category: Pr
         connector: &sql_con mysql
         server: &sql_server hal01.tsc.uc3m.es  # Write your server address here
         user: &sql_user username                  # Write username here
         password: &sql_password xxxxxxxx          # Write password here
       # ----
-      db_Crunch4Ever:
-        category: Pu
-        connector: *sql_con
-        server: localhost
-        user: *sql_user
-        password: *sql_password
-      # ----
       db_Pa_PATSTATS:
         category: Pa
         connector: *sql_con
@@ -95,10 +105,11 @@ connections:
         password: *sql_password
         port: None,
         unix_socket: '/var/run/mysqld/mysqld.sock'
-  neo4j:
-    server: xxxxxxx     # Write server here
-    user: neo4j         # Write username here
-    password: xxxxxx    # Write password here
+  # Uncomment and set neo4j parameters if available
+  # neo4j:
+  #   server: xxxxxxx     # Write server here
+  #   user: neo4j         # Write username here
+  #   password: xxxxxx    # Write password here
 
 # Specify format for the log outputs
 logformat:
 
@@ -0,0 +1,73 @@
+# This file contains the complete list of options in the main script.
+# It must contain at least a root menu with several options, and a description
+# of each option in the root menu.
+# Each option in any menu should have a description
+
+# ROOT MENU
+root:
+  options: 
+    # - create   <Some true options may be missed because the appropriate one
+    # - load      may have been selected by the starting command>
+    - setup
+    - show_SuperGraph
+    - compute_reference_graph
+    - subsample_reference_graph
+    - compute_all_sim_graphs
+    - validate_topic_models
+    - show_validation_results
+    - analyze_variability
+    - show_variability_results
+    - analyze_scalability
+    - show_scalability_results
+    - validate_subtrain_models
+    - show_subtrain_results
+
+# ##########################
+# OPTIONS FROM THE ROOT MENU
+create: 
+  title: Create new project
+  post_opts:
+    - setup
+
+load:
+    title: Load existing project
+
+setup: 
+  title: Activate configuration file
+
+show_SuperGraph:
+  title: Show supergraph structure
+
+compute_reference_graph:
+  title: Compute reference graph
+
+subsample_reference_graph:
+  title: Get reference graph by subsampling a large version
+
+compute_all_sim_graphs:
+  title: Compute all similarity graphs for validation
+
+validate_topic_models:
+  title: Validate topic models using the reference graph
+
+show_validation_results:
+  title: Generate graphical validation results
+
+analyze_variability:
+  title: Validate topic models using semantic variability
+
+show_variability_results:
+  title: Generate graphical result from the variability analysis
+
+analyze_scalability:
+  title: Analyze the scalability of graph generation for validation
+
+show_scalability_results:
+  title: Generate graphical results of the scalability analysis
+
+validate_subtrain_models:
+  title: Validate subtrained models
+
+show_subtrain_results:
+  title: Generate graphical results of the subtrained model validation
+