Skip to content
This repository was archived by the owner on Dec 19, 2018. It is now read-only.

Commit 7c35f5f

Browse files
author
eberhardtj
committed
init
0 parents  commit 7c35f5f

File tree

59 files changed

+2686
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+2686
-0
lines changed

README.adoc

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
= culturegraph-clustering
2+
:TOC:
3+
4+
An implementation of the graph clustering method used in culturegraph.
5+
6+
The input problem is encoded as a bipartite input graph.
7+
A cluster is defined as a connected component in this input graph.
8+
9+
A bipartite graph __G(U,V)__ is represented by two node sets (namely __U__ and __V__).
10+
We represent the bipartite graph with an adjacency list, that lists all
11+
children for each parent (assuming a digraph).
12+
13+
Although the bipartite graph used here is undirected, we refer to the node set __U__ as the __"Parent Node Set"__ and
14+
to the node set __V__ as the __"Child Node Set"__.
15+
16+
== Installation
17+
18+
=== Requirements
19+
20+
- Java 8 or later
21+
22+
=== Build CLI
23+
24+
Run the following command to construct a jar with dependencies.
25+
26+
----
27+
gradlew test fatJar
28+
----
29+
30+
== Command Line Client
31+
32+
=== Example
33+
34+
The input file describes an adjacency list, where each line starts with a parent node, followed by its child nodes.
35+
Parent and child nodes belong to distinct sets of nodes that only share connections between each other.
36+
37+
----
38+
% File: graph.txt
39+
% Schema: PARENT CHILD [... CHILD]
40+
A a b c
41+
B a
42+
C f
43+
----
44+
45+
----
46+
java -Xmx1G -jar cluster-cli-VERSION.jar -i graph.txt -o mapping.txt
47+
----
48+
49+
The resulting mapping tags each parent node with a number.
50+
All nodes that share the same number form a cluster.
51+
52+
----
53+
A 1
54+
B 1
55+
C 2
56+
----
57+
58+
TIP: For large problem instances, you need to tweak the JVM Memory option `-Xmx`.
59+
60+
=== Usage
61+
62+
Usage of the command line client.
63+
64+
```
65+
Usage:
66+
67+
<main class> [-chV] -o=OUTPUT [-s=NUM] INPUT
68+
69+
Description:
70+
71+
Clusters a bipartite-graph by its connected components.
72+
73+
Parameters:
74+
75+
INPUT Input adjacency list (plain text or gzip).
76+
77+
Options:
78+
79+
-o, --output=OUTPUT Output node-cluster-mapping.
80+
-c Compress output with gzip.
81+
-s, --size=NUM Minimum component size.
82+
Default: 0
83+
-h, --help Display this help message.
84+
-V, --version Display version info.
85+
```
86+
87+
== Good To Know
88+
89+
- The input adjacency list should only contain unique lines.
90+
- If a connected component does not reach the __minimum component size__, each parent node in this component is
91+
assigned to the component "__-1__".
92+
93+
=== Temporary Files
94+
95+
The procedure creates the following temporary files during a run.
96+
97+
.Table Temporary Files
98+
|===
99+
|Name | Description
100+
101+
| childNodeHashes.tmp
102+
| A list of unique hashes, one for each unique child node label.
103+
104+
| encodedInput.tmp
105+
| An encoded representation of the input adjacency list, where each node label is mapped to a unique numerical identifier.
106+
107+
| encodedParentNodes.tmp
108+
| A label to number mapping for each parent node.
109+
110+
|===

algorithm/build.gradle

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Build script of the algorithm module.
apply plugin: 'java-library'

dependencies {
    // Sibling project modules, exposed on the API of this library.
    api project(':graph')
    api project(':io')
    api project(':label-encoder')

    // Third-party libraries used internally.
    implementation 'net.sf.trove4j:trove4j:3.0.3'
    implementation 'org.slf4j:slf4j-api:1.7.25'

    // Test-only dependencies.
    testImplementation 'org.slf4j:slf4j-simple:1.7.25'
    testImplementation 'junit:junit:4.12'
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
import java.util.List;
4+
5+
public class AdjacencyListElement
6+
{
7+
private Node node;
8+
private List<Node> neighbourhood;
9+
10+
public AdjacencyListElement(Node node, List<Node> neighbourhood)
11+
{
12+
this.node = node;
13+
this.neighbourhood = neighbourhood;
14+
}
15+
16+
public Node getNode()
17+
{
18+
return node;
19+
}
20+
21+
public List<Node> getNeighbourhood()
22+
{
23+
return neighbourhood;
24+
}
25+
26+
public String asString()
27+
{
28+
return node.toString() + " " + neighbourhood.toString();
29+
}
30+
31+
@Override
32+
public String toString()
33+
{
34+
return asString();
35+
}
36+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
/**
 * A runnable algorithm.
 *
 * <p>The interface has a single abstract method, so it can be implemented
 * with a lambda expression or a method reference.
 */
@FunctionalInterface
public interface Algorithm
{
    /**
     * Executes the algorithm.
     *
     * @throws Exception if the algorithm fails
     */
    void run() throws Exception;
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
public class ClusteredNode extends Node
4+
{
5+
private int cluster;
6+
7+
public ClusteredNode(String label, int cluster)
8+
{
9+
super(label);
10+
this.cluster = cluster;
11+
}
12+
13+
public int getCluster()
14+
{
15+
return cluster;
16+
}
17+
18+
@Override
19+
public String asString()
20+
{
21+
return this.getLabel() + " " + cluster;
22+
}
23+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
public class EncodedNode extends Node
4+
{
5+
6+
private int id;
7+
8+
public EncodedNode(String label, int id)
9+
{
10+
super(label);
11+
this.id = id;
12+
}
13+
14+
public int getId()
15+
{
16+
return id;
17+
}
18+
19+
public static EncodedNode parse(String s)
20+
{
21+
String[] pair = s.split(" ", 2);
22+
String label = pair[0];
23+
String idNumber = pair[1];
24+
return new EncodedNode(label, Integer.parseInt(idNumber));
25+
}
26+
27+
@Override
28+
public String asString()
29+
{
30+
return this.getLabel() + " " + id;
31+
}
32+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
public class Node
4+
{
5+
private final String label;
6+
7+
public Node(String label)
8+
{
9+
this.label = label;
10+
}
11+
12+
public String getLabel()
13+
{
14+
return label;
15+
}
16+
17+
public EncodedNode encode(int id)
18+
{
19+
return new EncodedNode(label, id);
20+
}
21+
22+
@Override
23+
public String toString()
24+
{
25+
return asString();
26+
}
27+
28+
public String asString()
29+
{
30+
return label;
31+
}
32+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
public interface Procedure {
4+
void apply() throws ProcedureException;
5+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
public class ProcedureException extends StepException
4+
{
5+
public ProcedureException(String message, Throwable cause)
6+
{
7+
super(message, cause);
8+
}
9+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package org.culturegraph.clustering.algorithm.core;
2+
3+
public interface Step<T>
4+
{
5+
T apply() throws StepException;
6+
}

0 commit comments

Comments
 (0)