
Commit 9b80a05

skrawcz, zilto, and elijahbenizzy authored

Add experiment UI and plugin (#680)

* Experiment Manager v1: drafted the experiment manager; migrated summarization.py to the new openai client; added requirements.txt; added image artifact components and docstrings; added the entrypoint; completed the README and added a gif; refactored to hamilton.plugins; added GraphConstructionHook to __init__.py with the other public hooks and updated the ExperimentTracker hook accordingly.

* A few minor fixes/improvements to the Experiment Manager (#681):
  1. Generalizes the experiment directory (it no longer relies on .dat or .db)
  2. Changes the sort order for displaying experiments (most recent on top)
  3. Changes the `path` result to work for more than just parquet, standardizing it
  4. Adds instructions for running the example from the example directory

Co-authored-by: zilto <tjean@DESKTOP-V6JDCS2>
Co-authored-by: Elijah ben Izzy <[email protected]>

1 parent aa8af83 · commit 9b80a05

File tree

15 files changed: +1170 -0 lines changed
@@ -0,0 +1,7 @@
fastapi
fastui
openai
pypdf
sf-hamilton[visualization]
tiktoken
@@ -0,0 +1,152 @@
import os
import tempfile
from typing import Generator, Union

import tiktoken
from openai import OpenAI
from pypdf import PdfReader

from hamilton.htypes import Collect, Parallelizable


def openai_client() -> OpenAI:
    return OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def raw_text(pdf_source: Union[str, bytes, tempfile.SpooledTemporaryFile]) -> str:
    """Takes a filepath to a PDF and returns a string of the PDF's contents

    :param pdf_source: the path, or the temporary file, to the PDF.
    :return: the text of the PDF.
    """
    reader = PdfReader(pdf_source)
    _pdf_text = ""
    page_number = 0
    for page in reader.pages:
        page_number += 1
        _pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
    return _pdf_text


def tokenizer(tokenizer_encoding: str = "cl100k_base") -> tiktoken.core.Encoding:
    """Get the OpenAI tokenizer"""
    return tiktoken.get_encoding(tokenizer_encoding)


def _create_chunks(
    text: str, tokenizer: tiktoken.core.Encoding, max_length: int
) -> Generator[list[int], None, None]:
    """Yield successive chunks of roughly `max_length` tokens from the provided text,
    preferably ending each chunk at the end of a sentence.
    """
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * max_length
        # and 1.5 * max_length tokens
        j = min(i + int(1.5 * max_length), len(tokens))
        while j > i + int(0.5 * max_length):
            # Decode the tokens and check for a full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1
        # If no end of sentence is found, use max_length tokens as the chunk size
        if j == i + int(0.5 * max_length):
            j = min(i + max_length, len(tokens))
        yield tokens[i:j]
        i = j


def chunked_text(
    raw_text: str, tokenizer: tiktoken.core.Encoding, max_token_length: int = 800
) -> list[str]:
    """Tokenize the text; create chunks of size `max_token_length`;
    for each chunk, convert the tokens back to a text string
    """
    _encoded_chunks = _create_chunks(raw_text, tokenizer, max_token_length)
    _decoded_chunks = [tokenizer.decode(chunk) for chunk in _encoded_chunks]
    return _decoded_chunks


def chunk_to_summarize(chunked_text: list[str]) -> Parallelizable[str]:
    """Iterate over chunks that don't have a stored summary"""
    for chunk in chunked_text:
        yield chunk


def _summarize_text__openai(openai_client: OpenAI, prompt: str, openai_gpt_model: str) -> str:
    """Use the OpenAI chat API to ask a model to summarize content contained in a prompt"""
    response = openai_client.chat.completions.create(
        model=openai_gpt_model, messages=[{"role": "user", "content": prompt}], temperature=0
    )
    return response.choices[0].message.content


def prompt_to_summarize_chunk() -> str:
    """Base prompt for summarizing a chunk of text"""
    return "Extract key points with reasoning into a bulleted format.\n\nContent:{content}"


def chunk_summary(
    openai_client: OpenAI,
    chunk_to_summarize: str,
    prompt_to_summarize_chunk: str,
    openai_gpt_model: str,
) -> str:
    """Fill the base prompt with a chunk's content and summarize it"""
    filled_prompt = prompt_to_summarize_chunk.format(content=chunk_to_summarize)
    return _summarize_text__openai(openai_client, filled_prompt, openai_gpt_model)


def prompt_to_reduce_summaries() -> str:
    """Prompt for a "reduce" operation to summarize a list of summaries into a single text"""
    return """Write a summary from this collection of key points.
First answer the question in two sentences. Then, highlight the core argument, conclusions and evidence.
User query: {query}
The summary should be structured in bulleted lists following the headings Answer, Core Argument, Evidence, and Conclusions.
Key points:\n{chunks_summary}\nSummary:\n"""


def chunk_summary_collection(chunk_summary: Collect[str]) -> list[str]:
    """Collect the chunks for which a summary was just computed"""
    return chunk_summary


def final_summary(
    openai_client: OpenAI,
    query: str,
    chunk_summary_collection: list[str],
    prompt_to_reduce_summaries: str,
    openai_gpt_model: str,
) -> str:
    """Concatenate the list of chunk summaries into a single text, fill the prompt template,
    and use OpenAI to reduce the content into a single summary
    """
    concatenated_summaries = " ".join(chunk_summary_collection)
    filled_prompt = prompt_to_reduce_summaries.format(
        query=query, chunks_summary=concatenated_summaries
    )
    return _summarize_text__openai(openai_client, filled_prompt, openai_gpt_model)


if __name__ == "__main__":
    import summarization

    from hamilton import driver

    dr = (
        driver.Builder()
        .enable_dynamic_execution(allow_experimental_mode=True)
        .with_modules(summarization)
        .build()
    )
    dr.display_all_functions("./docs/summary", {"view": False, "format": "png"}, orient="TB")

    inputs = dict(
        pdf_source="./data/hamilton_paper.pdf",
        openai_gpt_model="gpt-3.5-turbo-0613",
        query="What are the main benefits of this tool?",
    )

    results = dr.execute(["final_summary"], inputs=inputs)
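Because Hamilton only computes the nodes needed for the requested outputs, the chunking above can be exercised without spending OpenAI credits. A minimal sketch, reusing the module and the `pdf_source` input from the `__main__` block (passing `max_token_length` as an input overrides its default of 800):

```python
from hamilton import driver

import summarization  # the module above

dr = (
    driver.Builder()
    # required because the module contains Parallelizable/Collect nodes
    .enable_dynamic_execution(allow_experimental_mode=True)
    .with_modules(summarization)
    .build()
)

# Requesting only `chunked_text` runs raw_text -> tokenizer -> chunked_text;
# no OpenAI call is made because `final_summary` is not requested.
chunks = dr.execute(
    ["chunked_text"],
    inputs={"pdf_source": "./data/hamilton_paper.pdf", "max_token_length": 400},
)["chunked_text"]
print(f"{len(chunks)} chunks")
```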
@@ -0,0 +1,125 @@
# Hamilton Experiment Manager

Add a hook to your Hamilton Driver to log runs and visualize artifacts and metadata! The server is built with FastAPI + FastUI, making it easy to integrate within your app or to extend the UI.

<p align="center">
  <img src="./showcase.gif" height=600 width=auto/>
</p>

## Features
- 📝 Track run metadata (config, inputs, code version, etc.)
- 📦 Generate directories to store your run artifacts
- 📡 Launch a local server to view and explore your experiments

## Installation
Use `pip install sf-hamilton[experiments]` to install both the hook and the server with their dependencies.

## How to use the ExperimentTracker hook
The `ExperimentTracker` hook can be added to your Hamilton Driver definition to automatically log metadata about the run and materialized results.

1. Create the `ExperimentTracker` hook object:
   - `experiment_name`: a name to organize related runs. It is used to create directories and is displayed in the UI
   - `base_directory`: the path where the metadata cache and the subdirectories storing artifacts will be created. Default is `./experiments`
2. Create the Hamilton Driver and pass the `ExperimentTracker` to the `with_adapters()` method
3. Define materializers for each artifact you care about. The UI provides rich support for `parquet`, `csv`, and `json`
   - ⚠ Make sure to use relative paths (ones that don't start with `/`) for artifacts to be stored in run directories
4. Call `dr.materialize()` to launch a run for which metadata and artifacts will be tracked
5. (Optional) Use `dr.visualize_materialization()` to visualize the dataflow, setting `output_file_path` to the run directory `tracker_hook.run_directory`

### Running the example

To run the example from the example directory, do the following:

```bash
cd examples/experiment_management
pip install -r requirements.txt  # or use your favorite env manager
python run.py
h_experiments  # initialize/run the server
```

Then navigate to `http://localhost:8123` to view the experiment manager! From there, you'll want to integrate it into your own workflow.

### Integrating your own
```python
from hamilton import driver
from hamilton.io.materialization import to
from hamilton.plugins import h_experiments

import my_functions  # <- your Hamilton module


# 1. create the hook
tracker_hook = h_experiments.ExperimentTracker(
    experiment_name="hello-world",
    base_directory="/path/to/experiments",
)

# 2. create the driver with your modules and the hook
dr = (
    driver.Builder()
    .with_modules(my_functions)
    .with_adapters(tracker_hook)
    .build()
)

# 3. define materializers (absolute or relative path)
materializers = [
    # notice the relative paths (they don't start with "/")
    to.json(
        id="model_performance__json",
        dependencies=["model_performance"],
        path="./model_performance.json",
    ),
    to.parquet(
        id="training_data__parquet",
        dependencies=["training_data"],
        path="./training_data.parquet",
    ),
]

# 4. launch a run using `.materialize()`
dr.materialize(*materializers)

# 5. (optional) visualize the materialization and store the figure
# under the `tracker_hook.run_directory` path
dr.visualize_materialization(
    *materializers,
    output_file_path=f"{tracker_hook.run_directory}/dag",
)
```

## How to use the experiment server
The experiment server is a local FastAPI server that reads the run metadata cache and mounts the `base_directory` to view and explore results. The frontend uses FastUI to create a React interface from Python.

### Start the FastAPI server
```
h_experiments
```

You should see in the terminal:
```
INFO: Started server process [24113]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://127.0.0.1:8123 (Press CTRL+C to quit)
```

### Set the experiments directory
```
h_experiments /path/to/base_directory
```

You can use an absolute or relative path. Default is `./experiments`.

### Set host and port
```
h_experiments --host $HOST --port $PORT
```
Defaults are `127.0.0.1` and `8123`.

## What's next?
Let us know how you find the experiment manager and which features you'd like to see! This project is still early/experimental, and there are several interesting avenues:
- Materializing artifacts to cloud storage
- A user interface to view node-level code diffs
- Performance profiling of runs
- A user interface to launch runs

Given this is a FastAPI server, you can easily extend it yourself and mount it as a subroute of your own application! A sketch of that pattern follows.
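A hypothetical sketch of that mounting pattern, using FastAPI's standard sub-application mounting. The import of the experiment server's app is a placeholder (its exact module path isn't shown in this diff), so adapt it to wherever the plugin defines its FastAPI instance:

```python
from fastapi import FastAPI

# Placeholder import: point this at wherever the experiment server's
# FastAPI app is defined under hamilton.plugins (not shown in this diff).
from hamilton.plugins.h_experiments.server import app as experiments_app

app = FastAPI()

# Serve the experiment UI under /experiments, next to your own routes.
app.mount("/experiments", experiments_app)


@app.get("/health")
def health() -> dict:
    return {"status": "ok"}
```

Run with `uvicorn my_app:app`, and the experiment UI would be served under `/experiments` alongside your application's endpoints.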
@@ -0,0 +1,103 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.figure import Figure
from sklearn.base import BaseEstimator, clone
from sklearn.datasets import load_diabetes
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from hamilton.function_modifiers import config, extract_fields


@extract_fields(dict(X_raw=np.ndarray, y=np.ndarray))
def load_data() -> dict:
    """Load the diabetes regression dataset as raw features and target."""
    X_raw, y = load_diabetes(return_X_y=True)
    return dict(X_raw=X_raw, y=y)


def splits(X_raw: np.ndarray, n_splits: int = 3) -> list[tuple]:
    """Produce (train_idx, eval_idx) pairs for k-fold cross-validation."""
    fold = KFold(n_splits=n_splits)
    return [(train_idx, eval_idx) for train_idx, eval_idx in fold.split(X_raw)]


@config.when_not_in(preprocess=["pca"])
def X__base(X_raw: np.ndarray) -> np.ndarray:
    """Pass the features through unchanged when no preprocessing is configured."""
    return X_raw


@config.when(preprocess="pca")
def X__pca(X_raw: np.ndarray, n_components: int = 5) -> np.ndarray:
    """Reduce the features with PCA when `preprocess="pca"` is configured."""
    pca = PCA(n_components=n_components)
    return pca.fit_transform(X_raw)


@config.when(model="linear")
def base_model__linear() -> BaseEstimator:
    return LinearRegression()


@config.when(model="random_forest")
def base_model__random_forest() -> BaseEstimator:
    return RandomForestRegressor()


@config.when(model="boosting")
def base_model__boosting() -> BaseEstimator:
    return HistGradientBoostingRegressor()


@extract_fields(
    dict(
        y_pred=np.ndarray,
        cv_scores=list,
    )
)
def cross_validation(
    X: np.ndarray,
    y: np.ndarray,
    base_model: BaseEstimator,
    splits: list[tuple],
) -> dict:
    """Fit a clone of the base model on each fold; collect the out-of-fold
    predictions and the per-fold mean squared errors.
    """
    cv_scores = []
    all_pred = np.zeros(y.shape[0])
    for train_idx, eval_idx in splits:
        model = clone(base_model)

        X_train, y_train = X[train_idx], y[train_idx]
        X_eval, y_eval = X[eval_idx], y[eval_idx]

        model.fit(X_train, y_train)

        y_eval_pred = model.predict(X_eval)
        all_pred[eval_idx] = y_eval_pred

        cv_score = mean_squared_error(y_eval, y_eval_pred)
        cv_scores.append(cv_score)

    return dict(y_pred=all_pred, cv_scores=cv_scores)


def trained_model(
    base_model: BaseEstimator,
    X: np.ndarray,
    y: np.ndarray,
) -> BaseEstimator:
    """Fit the base model on the full dataset."""
    base_model.fit(X, y)
    return base_model


def prediction_df(y: np.ndarray, y_pred: np.ndarray) -> pd.DataFrame:
    return pd.DataFrame.from_dict(dict(y_true=y, y_pred=y_pred), orient="columns")


def prediction_plot(y: np.ndarray, y_pred: np.ndarray) -> Figure:
    """Scatter the true targets against the out-of-fold predictions."""
    fig, ax = plt.subplots()
    ax.scatter(y, y_pred)
    ax.set_xlabel("True")
    ax.set_ylabel("Predicted")

    return fig
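Since the module above selects between implementations with `@config.when`, a driver needs a config to resolve `X` and `base_model`. A minimal sketch, assuming the file is saved as `model_training.py` (the module name isn't visible in this diff); any of the three `model` values in the decorators works:

```python
from hamilton import driver

import model_training  # assumed module name for the file above

# `preprocess` and `model` pick the @config.when variants:
# here X__pca provides X, and base_model__boosting provides base_model.
dr = (
    driver.Builder()
    .with_modules(model_training)
    .with_config({"preprocess": "pca", "model": "boosting"})
    .build()
)

results = dr.execute(["cv_scores", "prediction_plot"])
print(results["cv_scores"])  # per-fold mean squared errors
```

Swapping the config values (e.g. `{"model": "linear"}` with no `preprocess` key) reruns the same dataflow with a different model and no PCA, which is exactly the kind of variation the ExperimentTracker hook is meant to log and compare.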
@@ -0,0 +1,7 @@
fastparquet
matplotlib
numpy
pandas
pyarrow
scikit-learn
sf-hamilton[experiments,visualization]
