Skip to content

Commit 997b8a4

Browse files
authored
Merge pull request #1032 from dondi/beta
v6.0.7
2 parents 5030116 + df60678 commit 997b8a4

File tree

19 files changed

+727
-382
lines changed

19 files changed

+727
-382
lines changed

database/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ Here are the files pertaining to both the network and expression databases. Look
2828
From there, create the schemas using the following commands:
2929
3030
```
31-
CREATE SCHEMA spring2022_network;
31+
CREATE SCHEMA gene_regulatory_network;
3232
```
3333
3434
```
35-
CREATE SCHEMA fall2021;
35+
CREATE SCHEMA gene_expression;
3636
```
3737
3838
Once they are created you can exit your database using the command `\q`.

database/expression-database/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ All files pertaining the expression database live within this directory.
66

77
#### Schema
88

9-
All network data is stored within the fall2021 schema on our Postgres database.
9+
All expression data is stored within the gene_expression schema on our Postgres database.
1010

11-
The schema is located within this directory at the top level in the file `schema.sql`. It defines the tables located within the fall2021 schema.
11+
The schema is located within this directory at the top level in the file `schema.sql`. It defines the tables located within the gene_expression schema.
1212

1313
Usage:
1414
To load to local database
Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
CREATE TABLE fall2021.ref (
1+
CREATE TABLE gene_expression.ref (
22
pubmed_id VARCHAR,
33
authors VARCHAR,
44
publication_year VARCHAR,
@@ -8,18 +8,18 @@ CREATE TABLE fall2021.ref (
88
PRIMARY KEY(ncbi_geo_id, pubmed_id)
99
);
1010

11-
CREATE TABLE fall2021.gene (
11+
CREATE TABLE gene_expression.gene (
1212
gene_id VARCHAR, -- systematic like name
1313
display_gene_id VARCHAR, -- standard like name
1414
species VARCHAR,
1515
taxon_id VARCHAR,
1616
PRIMARY KEY(gene_id, taxon_id)
1717
);
1818

19-
CREATE TABLE fall2021.expression_metadata (
19+
CREATE TABLE gene_expression.expression_metadata (
2020
ncbi_geo_id VARCHAR,
2121
pubmed_id VARCHAR,
22-
FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES fall2021.ref(ncbi_geo_id, pubmed_id),
22+
FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES gene_expression.ref(ncbi_geo_id, pubmed_id),
2323
control_yeast_strain VARCHAR,
2424
treatment_yeast_strain VARCHAR,
2525
control VARCHAR,
@@ -33,10 +33,10 @@ CREATE TABLE fall2021.expression_metadata (
3333
display_expression_table VARCHAR,
3434
PRIMARY KEY(ncbi_geo_id, pubmed_id, time_value)
3535
);
36-
CREATE TABLE fall2021.expression (
36+
CREATE TABLE gene_expression.expression (
3737
gene_id VARCHAR,
3838
taxon_id VARCHAR,
39-
FOREIGN KEY (gene_id, taxon_id) REFERENCES fall2021.gene(gene_id, taxon_id),
39+
FOREIGN KEY (gene_id, taxon_id) REFERENCES gene_expression.gene(gene_id, taxon_id),
4040
-- ncbi_geo_id VARCHAR,
4141
-- pubmed_id VARCHAR,
4242
sort_index INT,
@@ -45,27 +45,27 @@ CREATE TABLE fall2021.expression (
4545
time_point FLOAT,
4646
dataset VARCHAR,
4747
PRIMARY KEY(gene_id, sample_id)
48-
-- FOREIGN KEY (ncbi_geo_id, pubmed_id, time_point) REFERENCES fall2021.expression_metadata(ncbi_geo_id, pubmed_id, time_value)
48+
-- FOREIGN KEY (ncbi_geo_id, pubmed_id, time_point) REFERENCES gene_expression.expression_metadata(ncbi_geo_id, pubmed_id, time_value)
4949
);
50-
CREATE TABLE fall2021.degradation_rate (
50+
CREATE TABLE gene_expression.degradation_rate (
5151
gene_id VARCHAR,
5252
taxon_id VARCHAR,
53-
FOREIGN KEY (gene_id, taxon_id) REFERENCES fall2021.gene(gene_id, taxon_id),
53+
FOREIGN KEY (gene_id, taxon_id) REFERENCES gene_expression.gene(gene_id, taxon_id),
5454
ncbi_geo_id VARCHAR,
5555
pubmed_id VARCHAR,
56-
FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES fall2021.ref(ncbi_geo_id, pubmed_id),
56+
FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES gene_expression.ref(ncbi_geo_id, pubmed_id),
5757
PRIMARY KEY(gene_id, ncbi_geo_id, pubmed_id),
5858
degradation_rate FLOAT
5959
);
6060

61-
CREATE TABLE fall2021.production_rate (
61+
CREATE TABLE gene_expression.production_rate (
6262
gene_id VARCHAR,
6363
taxon_id VARCHAR,
64-
FOREIGN KEY (gene_id, taxon_id) REFERENCES fall2021.gene(gene_id, taxon_id),
64+
FOREIGN KEY (gene_id, taxon_id) REFERENCES gene_expression.gene(gene_id, taxon_id),
6565
ncbi_geo_id VARCHAR,
6666
pubmed_id VARCHAR,
67-
FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES fall2021.ref(ncbi_geo_id, pubmed_id),
67+
FOREIGN KEY (ncbi_geo_id, pubmed_id) REFERENCES gene_expression.ref(ncbi_geo_id, pubmed_id),
6868
PRIMARY KEY(gene_id, ncbi_geo_id, pubmed_id),
6969
production_rate FLOAT
70-
-- FOREIGN KEY (gene_id, ncbi_geo_id, pubmed_id) REFERENCES fall2021.degradation_rate(gene_id, ncbi_geo_id, pubmed_id) -- not sure if we want to link the generated production rate to it's original degradation rate
70+
-- FOREIGN KEY (gene_id, ncbi_geo_id, pubmed_id) REFERENCES gene_expression.degradation_rate(gene_id, ncbi_geo_id, pubmed_id) -- not sure if we want to link the generated production rate to its original degradation rate
7171
);

database/expression-database/scripts/loader.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def convert_int(potential_int):
4545
This program Loads Refs into the database
4646
"""
4747
def LOAD_REFS():
48-
print('COPY fall2021.ref (pubmed_id, authors, publication_year, title, doi, ncbi_geo_id) FROM stdin;')
48+
print('COPY gene_expression.ref (pubmed_id, authors, publication_year, title, doi, ncbi_geo_id) FROM stdin;')
4949
REFS_SOURCE = '../script-results/processed-expression/refs.csv'
5050
with open(REFS_SOURCE, 'r+') as f:
5151
reader = csv.reader(f)
@@ -67,7 +67,7 @@ def LOAD_REFS():
6767
This program Loads ID Mapping into the database
6868
"""
6969
def LOAD_GENES():
70-
print('COPY fall2021.gene (gene_id, display_gene_id, species, taxon_id) FROM stdin;')
70+
print('COPY gene_expression.gene (gene_id, display_gene_id, species, taxon_id) FROM stdin;')
7171
GENE_SOURCE = '../script-results/processed-expression/genes.csv'
7272
with open(GENE_SOURCE, 'r+') as f:
7373
reader = csv.reader(f)
@@ -87,7 +87,7 @@ def LOAD_GENES():
8787
This program Loads Expression Metadata into the database
8888
"""
8989
def LOAD_EXPRESSION_METADATA():
90-
print('COPY fall2021.expression_metadata (ncbi_geo_id, pubmed_id, control_yeast_strain, treatment_yeast_strain, control, treatment, concentration_value, concentration_unit, time_value, time_unit, number_of_replicates, expression_table) FROM stdin;')
90+
print('COPY gene_expression.expression_metadata (ncbi_geo_id, pubmed_id, control_yeast_strain, treatment_yeast_strain, control, treatment, concentration_value, concentration_unit, time_value, time_unit, number_of_replicates, expression_table) FROM stdin;')
9191
EXPRESSION_METADATA_SOURCE = '../script-results/processed-expression/expression-metadata.csv'
9292
with open(EXPRESSION_METADATA_SOURCE, 'r+') as f:
9393
reader = csv.reader(f)
@@ -116,7 +116,7 @@ def LOAD_EXPRESSION_METADATA():
116116
This program Loads Expression Data into the database
117117
"""
118118
def LOAD_EXPRESSION_DATA():
119-
print('COPY fall2021.expression (gene_id, taxon_id, sort_index, sample_id, expression, time_point, dataset) FROM stdin;')
119+
print('COPY gene_expression.expression (gene_id, taxon_id, sort_index, sample_id, expression, time_point, dataset) FROM stdin;')
120120
EXPRESSION_DATA_SOURCE = '../script-results/processed-expression/expression-data.csv'
121121
with open(EXPRESSION_DATA_SOURCE, 'r+') as f:
122122
reader = csv.reader(f)
@@ -140,7 +140,7 @@ def LOAD_EXPRESSION_DATA():
140140
This program Loads Production Rates into the database
141141
"""
142142
def LOAD_PRODUCTION_RATES():
143-
print('COPY fall2021.production_rate (gene_id, taxon_id, ncbi_geo_id, pubmed_id, production_rate) FROM stdin;')
143+
print('COPY gene_expression.production_rate (gene_id, taxon_id, ncbi_geo_id, pubmed_id, production_rate) FROM stdin;')
144144
PRODUCTION_RATES_SOURCE = '../script-results/processed-expression/production-rates.csv'
145145
with open(PRODUCTION_RATES_SOURCE, 'r+') as f:
146146
reader = csv.reader(f)
@@ -161,7 +161,7 @@ def LOAD_PRODUCTION_RATES():
161161
This program Loads Degradation Rates into the database
162162
"""
163163
def LOAD_DEGRADATION_RATES():
164-
print('COPY fall2021.degradation_rate (gene_id, taxon_id, ncbi_geo_id, pubmed_id, degradation_rate) FROM stdin;')
164+
print('COPY gene_expression.degradation_rate (gene_id, taxon_id, ncbi_geo_id, pubmed_id, degradation_rate) FROM stdin;')
165165
DEGRADATION_RATES_SOURCE = '../script-results/processed-expression/degradation-rates.csv'
166166
with open(DEGRADATION_RATES_SOURCE, 'r+') as f:
167167
reader = csv.reader(f)

database/network-database/README.md

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ All files pertaining the network database live within this directory.
66

77
### Schema
88

9-
All network data is stored within the spring2022_network schema on our Postgres database.
9+
All network data is stored within the gene_regulatory_network schema on our Postgres database.
1010

11-
The schema is located within this directory at the top level in the file `schema.sql`. It defines the tables located within the spring2022_network schema.
11+
The schema is located within this directory at the top level in the file `schema.sql`. It defines the tables located within the gene_regulatory_network schema.
1212

1313
Usage:
1414
To load to local database
@@ -32,10 +32,13 @@ Within the scripts directory, there are the following files:
3232

3333
- `generate_network.py`
3434
- `loader.py`
35+
- `generate_new_network_version.py`
36+
- `loader_updates.py`
3537
- `filter_genes.py`
3638
- `generate_sgd_network_from_yeastract_network.py`
3739

38-
#### Network Generator (and data preprocessor)
40+
41+
#### Network Generator (and data preprocessor) (FOR FRESH DATABASE INSTALLS ONLY)
3942

4043
This script (`generate_network.py`) is a two-for-one. It first uses the yeastmine service from the SGD database to query for all regulator genes relating to Saccharomyces cerevisiae. From there it gets all of the targets for each regulator gene. We then construct two networks from these connections (a regulator by regulator matrix as well as a regulator by target matrix). We also construct the processed loader files, so that they are ready to load using `loader.py`.
4144

@@ -47,7 +50,7 @@ Usage:
4750
```
4851
python3 generate_network.py
4952
```
50-
#### Database Loader
53+
#### Database Loader (FOR FRESH DATABASE INSTALLS ONLY)
5154

5255
This script (`loader.py`) is to be used to load your preprocessed genes into the database.
5356

@@ -62,6 +65,35 @@ To load to production database
6265
```
6366
python3 loader.py | psql <address to database>
6467
```
68+
#### Network Generator (and data preprocessor) (FOR UPDATES TO EXISTING DATABASE ONLY)
69+
70+
This script (`generate_new_network_version.py`) is similar to its counterpart `generate_network.py`. It gets all existing genes in the database using the environment variable 'DB_URL'. You can set this environment variable on the terminal right before the command. It uses the yeastmine service from the SGD database to query for all regulator genes relating to Saccharomyces cerevisiae. From there it gets all of the targets for each regulator gene. We then construct two networks from these connections (a regulator by regulator matrix as well as a regulator by target matrix). We then see if the genes in the newly constructed network have any updates (i.e., a gene's standard name was set or a new gene was added to the database). We also construct the processed loader files, so that they are ready to load using `loader_updates.py`.
71+
72+
The resulting network matrices are located in `script-results/networks` and the resulting processed loader files are located within `script-results/processed-loader-files`
73+
74+
Make sure to have all dependencies installed beforehand or you will receive errors. (pip3 install intermine, tzlocal, etc. [see file for all imports])
75+
76+
Usage:
77+
```
78+
DB_URL="postgresql://[<db_user>:<password>]@<address to database>/<database name>" python3 generate_new_network_version.py
79+
```
80+
#### Database Loader (FOR UPDATES TO EXISTING DATABASE ONLY)
81+
82+
This script (`loader_updates.py`) is to be used to load your preprocessed genes into the database.
83+
84+
This program generates direct SQL statements from the source files generated by the network generator in order to populate a relational database with those files’ data as well as make any needed updates to existing genes within the database. If necessary you will be prompted to enter a password.
85+
86+
Usage:
87+
To load to local database
88+
```
89+
python3 loader_updates.py | psql postgresql://localhost/postgres
90+
```
91+
To load to production database
92+
```
93+
python3 loader_updates.py | psql -h <grnsight database link> -U <user> <database name>
94+
95+
```
96+
6597

6698
#### Filter Genes (beta functionality, not tested)
6799

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
1-
CREATE TABLE spring2022_network.source (
1+
CREATE TABLE gene_regulatory_network.source (
22
time_stamp TIMESTAMP WITH TIME ZONE,
33
source VARCHAR,
4-
source_display_name VARCHAR,
4+
display_name VARCHAR,
55
PRIMARY KEY(time_stamp, source)
66
);
77

8-
CREATE TABLE spring2022_network.gene (
8+
CREATE TABLE gene_regulatory_network.gene (
99
gene_id VARCHAR, -- systematic like name
1010
display_gene_id VARCHAR, -- standard like name
1111
species VARCHAR,
1212
taxon_id VARCHAR,
1313
regulator BOOLEAN,
1414
PRIMARY KEY(gene_id, taxon_id)
1515
);
16-
CREATE TABLE spring2022_network.network (
16+
CREATE TABLE gene_regulatory_network.network (
1717
regulator_gene_id VARCHAR,
1818
target_gene_id VARCHAR,
1919
taxon_id VARCHAR,
2020
time_stamp TIMESTAMP WITH TIME ZONE,
2121
source VARCHAR,
22-
FOREIGN KEY (regulator_gene_id, taxon_id) REFERENCES spring2022_network.gene(gene_id, taxon_id),
23-
FOREIGN KEY (target_gene_id, taxon_id) REFERENCES spring2022_network.gene(gene_id, taxon_id),
24-
FOREIGN KEY (time_stamp, source) REFERENCES spring2022_network.source(time_stamp, source)
22+
FOREIGN KEY (regulator_gene_id, taxon_id) REFERENCES gene_regulatory_network.gene(gene_id, taxon_id),
23+
FOREIGN KEY (target_gene_id, taxon_id) REFERENCES gene_regulatory_network.gene(gene_id, taxon_id),
24+
FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network.source(time_stamp, source)
2525
);

database/network-database/scripts/filter_genes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
port="5432",
1414
database="postgres")
1515
cursor = connection.cursor()
16-
postgreSQL_select_Query = "select * from spring2022_network.gene"
16+
postgreSQL_select_Query = "select * from gene_regulatory_network.gene"
1717

1818
cursor.execute(postgreSQL_select_Query)
1919
print("Selecting rows from gene table using cursor.fetchall")

database/network-database/scripts/generate_network.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def create_regulator_to_target_row(target, all_regulators):
140140
# Source Table
141141

142142
SOURCE_DESTINATION = '../script-results/processed-loader-files/source.csv'
143-
timestamp = datetime.datetime.now(datetime.timezone.utc)
143+
timestamp = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
144144

145145
source = "YeastMine - Saccharomyces Genome Database"
146146
display_name = "Yeastmine - SGD"

0 commit comments

Comments
 (0)