feat: new visualizations and dataframe manipulations and local script setup

vancauwe · vancauwe · commit a6979783eba7 · 2025-11-14T09:58:27.000+01:00
diff --git a/src/neo4j-quickstart/README.md b/src/neo4j-quickstart/README.md
@@ -2,16 +2,25 @@
 
 For open pulse project, you will manipulate graph data with Neo4J. The example notebook will guide you on how to download and visualize the graph. 
 
-## Set-up of dependencies
+## Working on Renku with Jupyter Notebook 
+
+This set-up provides jupyter notebook support if this is your preferred working method.
+
+Please see `2025-hackathon.md` at the root of the repository for detailed instructions.
+
+## Working locally
 
 We are using uv ([installation instructions here](https://docs.astral.sh/uv/getting-started/installation/)).
 
 1. Create virtual environment: `uv venv`
 2. Activate it: `source .venv/bin/activate`
 3. Get all predefined dependencies from the `uv.lock` file by running the command: `uv sync`
+4. Run the code with `python quickstart.py`
+
+Note: using Jupyter Notebook together with uv is not covered by this set-up, so when working locally you will need to work with Python scripts only.
 
 ## Build docker
 
-Locally: `docker build -f tools/images/Dockerfile -t test .`
+Locally: `docker build -f tools/images/Dockerfile -t neo4j-quickstart .`
 
 Else there is an integrated github CI in the github workflows of this repository.
diff --git a/src/neo4j-quickstart/quickstart.ipynb b/src/neo4j-quickstart/quickstart.ipynb
@@ -59,24 +59,16 @@
    "source": [
     "import neo4j\n",
     "from utils.neo4jdownloader import Neo4JDownloader\n",
-    "#from dotenv import load_dotenv\n",
     "from pathlib import Path\n",
     "import os\n",
     "\n",
-    "#load_dotenv()  # Load environment variables from .env file\n",
-    "\n",
     "def get_downloader():\n",
     "    secrets_dir = Path(\"/secrets\")\n",
     "    NEO4J_URI = (secrets_dir / \"neo4j_uri\").read_text()\n",
     "    NEO4J_USERNAME = (secrets_dir / \"neo4j_user\").read_text()\n",
     "    NEO4J_PASSWORD = (secrets_dir / \"neo4j_password\").read_text()\n",
     "    NEO4J_DATABASE = (secrets_dir / \"neo4j_database\").read_text()\n",
     "\n",
-    "    # NEO4J_URI = os.environ.get(\"NEO4J_URI\")\n",
-    "    # NEO4J_USERNAME = os.environ.get(\"NEO4J_USER\")\n",
-    "    # NEO4J_PASSWORD = os.environ.get(\"NEO4J_PASSWORD\")\n",
-    "    # NEO4J_DATABASE = neo4j_database\n",
-    "\n",
     "    return Neo4JDownloader(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE)\n",
     "\n",
     "def extract_data(nodes, relationships):\n",
@@ -117,6 +109,14 @@
     "df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "2e0e247e",
+   "metadata": {},
+   "source": [
+    "Let's build a graph from the first 200 rows of the dataframe: "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -128,6 +128,57 @@
     "graph = df_to_pydantic_models(df.head(200), relationships)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "43d755ac",
+   "metadata": {},
+   "source": [
+    "Let's see how we can filter our dataframe (as a classic pandas dataframe) to get all information about EPFL or SDSC. We will then use both filtered dataframes to continue the visualizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ede627ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "epfl_pattern = r\"EPFL\"\n",
+    "epfl_df = df[\n",
+    "    df['source'].astype(str).str.contains(epfl_pattern, flags=re.IGNORECASE, na=False) |\n",
+    "    df['target'].astype(str).str.contains(epfl_pattern, flags=re.IGNORECASE, na=False)\n",
+    "]\n",
+    "epfl_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7c8199a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sdsc_pattern = r\"(?:SwissDataScienceCenter|SDSC)\"  # non-capturing group: avoids the pandas match-groups warning\n",
+    "sdsc_df = df[\n",
+    "    df[\"source\"].astype(str).str.contains(sdsc_pattern, flags=re.IGNORECASE, na=False) |\n",
+    "    df[\"target\"].astype(str).str.contains(sdsc_pattern, flags=re.IGNORECASE, na=False)\n",
+    "]\n",
+    "sdsc_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d2ef7c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sdsc_graph = df_to_pydantic_models(sdsc_df, relationships)\n",
+    "epfl_graph = df_to_pydantic_models(epfl_df, relationships)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "4f5f2f38",
@@ -164,8 +215,15 @@
    "source": [
     "from utils.visualization import visualize_graph\n",
     "from pathlib import Path\n",
-    "output_path = Path(\"plots/graphs/graph_visualization.png\")\n",
-    "visualize_graph(graph, output_path)"
+    "\n",
+    "output_path = Path(\"plots/graphs/graph_200_visualization.png\")\n",
+    "visualize_graph(graph, output_path)\n",
+    "\n",
+    "output_path = Path(\"plots/graphs/sdsc_graph.png\")\n",
+    "visualize_graph(sdsc_graph, output_path)\n",
+    "\n",
+    "output_path = Path(\"plots/graphs/epfl_graph.png\")\n",
+    "visualize_graph(epfl_graph, output_path)"
    ]
   },
   {
@@ -188,7 +246,25 @@
     "from utils.visualization import visualize_clusters\n",
     "from pathlib import Path\n",
     "output_dir = Path(\"plots/clusters/\")\n",
-    "visualize_clusters(graph, output_dir)"
+    "\n",
+    "cluster_prefix_name = \"200_first_nodes\"\n",
+    "visualize_clusters(graph, output_dir, cluster_prefix_name)\n",
+    "\n",
+    "cluster_prefix_name = \"sdsc\"\n",
+    "visualize_clusters(sdsc_graph, output_dir, cluster_prefix_name)\n",
+    "\n",
+    "cluster_prefix_name = \"epfl\"\n",
+    "visualize_clusters(epfl_graph, output_dir, cluster_prefix_name)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e2d2c1e",
+   "metadata": {},
+   "source": [
+    "### Follow up on this example visualization: \n",
+    "\n",
+    "We can see for EPFL that simple string matching fails to find many of the EPFL-affiliated repositories. How can we complement it with other tools and approaches to build a better EPFL graph? Your turn to play around, good luck!"
    ]
   }
  ],
diff --git a/src/neo4j-quickstart/quickstart.py b/src/neo4j-quickstart/quickstart.py
@@ -0,0 +1,139 @@
+import neo4j
+import re
+from pathlib import Path
+import os
+from dotenv import load_dotenv 
+load_dotenv()  # Load environment variables from .env file
+
+from utils.neo4jdownloader import Neo4JDownloader
+from utils.builder_dataframe import neo4j_to_dataframe
+from utils.builder_models import df_to_pydantic_models
+from utils.visualization import visualize_graph
+from utils.visualization import visualize_clusters
+
+# ---------------------------
+# EXTRACT DATA FROM NEO4J
+# ---------------------------
+
+# Define your nodes: the node types passed to downloader.retrieve_nodes()
+
+nodes = ["user", "repo", "org"]
+
+# Define your relationships (edges): each relationship name maps to one or more source/target node-type pairs
+
+relationships = {
+    "member_of": {"type1": {"source": "user", "target": "org"}},
+    "owner_of": {
+        "type1": {"source": "user", "target": "repo"},
+        "type2": {"source": "org", "target": "repo"},
+    },
+    "contributor_of": {
+        "type1": {"source": "user", "target": "repo"},
+        "type2": {"source": "org", "target": "repo"},
+    },
+    "parent_of": {
+        "type1": {"source": "repo", "target": "repo"},
+    },
+}
+
+def get_downloader():
+    """Create a Neo4JDownloader from the NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD and NEO4J_DATABASE environment variables."""
+    NEO4J_URI = os.environ.get("NEO4J_URI")
+    NEO4J_USERNAME = os.environ.get("NEO4J_USER")
+    NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
+    NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE")
+
+    print(NEO4J_URI)
+
+    return Neo4JDownloader(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE)
+
+def extract_data(nodes, relationships):
+    """Retrieve nodes and edges via the downloader and return (nodes_ids, nodes_features, edges_indices, edges_attributes); the downloader is always closed."""
+    downloader = get_downloader()
+    try:
+        nodes_ids, nodes_features = downloader.retrieve_nodes(nodes)
+        edges_indices, edges_attributes = downloader.retrieve_edges(relationships)
+
+        return nodes_ids, nodes_features, edges_indices, edges_attributes
+    finally:
+        downloader.close()
+
+
+nodes_ids, nodes_features, edges_indices, edges_attributes = extract_data(nodes, relationships)
+# Example of inspecting the output (uncomment to print):
+# print(nodes_ids["org"])
+# print(nodes_features["org"])
+# print(edges_indices)
+
+# -------------------------------------------
+# MAKE NEO4J DATA INTO A PANDAS DATAFRAME
+# -------------------------------------------
+# Note: edges_attributes is not passed to neo4j_to_dataframe below.
+df = neo4j_to_dataframe(nodes_ids, nodes_features, edges_indices, relationships)
+print("Dataframe constructed, shape is :", df.shape)
+
+# -------------------------------------------
+# EXPLORE / FILTER PANDAS DATAFRAME
+# -------------------------------------------
+
+# Define your pattern and filter the dataframe: keep the rows whose 'source' or
+# 'target' value matches the pattern (case-insensitive regular expression search).
+epfl_pattern = r"EPFL"
+epfl_df = df[
+    df['source'].astype(str).str.contains(epfl_pattern, flags=re.IGNORECASE, na=False) |
+    df['target'].astype(str).str.contains(epfl_pattern, flags=re.IGNORECASE, na=False)
+]
+print(epfl_df.head())
+print(epfl_df.shape)
+# Non-capturing group: pandas str.contains warns when the pattern contains capture groups.
+sdsc_pattern = r"(?:SwissDataScienceCenter|SDSC)"
+sdsc_df = df[
+    df["source"].astype(str).str.contains(sdsc_pattern, flags=re.IGNORECASE, na=False) |
+    df["target"].astype(str).str.contains(sdsc_pattern, flags=re.IGNORECASE, na=False)
+]
+print(sdsc_df.head())
+print(sdsc_df.shape)
+
+# -----------------------------------------------------------------------
+# FEED YOUR DATAFRAME TO THE PYDANTIC MODELS AND VISUALIZE THE GRAPH
+# -----------------------------------------------------------------------
+
+# From Dataframes to Graphs (via Pydantic). `graph` is the sample built from the first 200 dataframe rows.
+graph = df_to_pydantic_models(df.head(200), relationships)
+sdsc_graph = df_to_pydantic_models(sdsc_df, relationships)
+epfl_graph = df_to_pydantic_models(epfl_df, relationships)
+
+# Full Graphs
+
+output_path = Path("plots/graphs/graph_200_visualization.png")
+visualize_graph(graph, output_path)
+
+output_path = Path("plots/graphs/sdsc_graph.png")
+visualize_graph(sdsc_graph, output_path)
+
+output_path = Path("plots/graphs/epfl_graph.png")
+visualize_graph(epfl_graph, output_path)
+
+# Clusters
+
+output_dir = Path("plots/clusters/")
+
+cluster_prefix_name = "200_first_nodes"
+visualize_clusters(graph, output_dir, cluster_prefix_name)
+
+cluster_prefix_name = "sdsc"
+visualize_clusters(sdsc_graph, output_dir, cluster_prefix_name)
+
+cluster_prefix_name = "epfl"
+visualize_clusters(epfl_graph, output_dir, cluster_prefix_name)
+
+# -----------------------------------------------------------------------
+# DEMO FOLLOW UP 
+
+# We can see for EPFL that simple string matching fails to find many of the EPFL-affiliated repositories.
+# How can we complement it with other tools and approaches to build a better EPFL graph?
+# Your turn to play around, good luck!
+
+# -----------------------------------------------------------------------
+
+
diff --git a/src/neo4j-quickstart/utils/visualization.py b/src/neo4j-quickstart/utils/visualization.py