Skip to content

Commit 3e0586f

Browse files
authored
Added cost commands. (#163)
* Added commands to calculate the cost for workflows (GCS backend only).
1 parent 69d2e4a commit 3e0586f

File tree

2 files changed

+170
-1
lines changed

2 files changed

+170
-1
lines changed

README.md

+10
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,16 @@ requires `column`, `curl`, `mail`, and [jq](https://stedolan.github.io/jq/)
9898
#### Update cromwell server
9999
* `update-server`
100100
* Change the cromwell server that new jobs will be submitted to.
101+
102+
#### Get cost for a workflow
103+
* `cost`
104+
* Get the cost for a workflow.
105+
* Will only work for workflows that completed more than 8 hours ago on GCS.
106+
        * Requires the `gcp_bq_cost_table.config` configuration file to exist in the cromshell configuration directory; its first line must be the name of your organization's BigQuery cost table.
107+
* `cost-detailed`
108+
* Get the cost for a workflow at the task level.
109+
* Will only work for workflows that completed more than 8 hours ago on GCS.
110+
        * Requires the `gcp_bq_cost_table.config` configuration file to exist in the cromshell configuration directory; its first line must be the name of your organization's BigQuery cost table.
101111

102112
### Features:
103113
* Running `submit` will create a new folder in the `~/.cromshell/${CROMWELL_URL}/` directory named with the cromwell job id of the newly submitted job.

cromshell

+160-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ mkdir -p ${CROMSHELL_CONFIG_DIR}
4444
CROMWELL_SUBMISSIONS_FILE="${CROMSHELL_CONFIG_DIR}/all.workflow.database.tsv"
# Create the submissions database with its header row the first time through.
# The path is quoted so that a config directory containing whitespace does not
# produce an "ambiguous redirect".
[[ ! -f "${CROMWELL_SUBMISSIONS_FILE}" ]] && echo -e "DATE\tCROMWELL_SERVER\tRUN_ID\tWDL_NAME\tSTATUS\tALIAS" > "${CROMWELL_SUBMISSIONS_FILE}"

# Set the GCP big query cost table file:
BQ_COST_TABLE_FILE="${CROMSHELL_CONFIG_DIR}/gcp_bq_cost_table.config"

# Update cromshell submissions file if it needs updating:
# (r is 0 when the header line already ends with the ALIAS column.)
grep -q 'ALIAS$' "${CROMWELL_SUBMISSIONS_FILE}"
r=$?
@@ -207,6 +210,20 @@ function usage()
207210
echo -e " Update cromwell server:"
208211
echo -e " update-server Change which cromwell server jobs will be submitted to."
209212
echo -e ""
213+
echo -e " Get cost for a workflow"
214+
echo -e " cost [workflow-id] [[workflow-id]...] Get the cost for a workflow."
215+
echo -e " Only works for workflows that completed"
216+
echo -e " more than 8 hours ago on GCS."
217+
echo -e " Requires the 'gcp_bq_cost_table.config'"
218+
echo -e " configuration file to exist and contain"
219+
echo -e " the big query cost table for your organization."
220+
echo -e " cost-detailed [workflow-id] [[workflow-id]...] Get the cost for a workflow at the task level."
221+
echo -e " Only works for workflows that completed"
222+
echo -e " more than 8 hours ago on GCS."
223+
echo -e " Requires the 'gcp_bq_cost_table.config'"
224+
echo -e " configuration file to exist and contain"
225+
echo -e " the big query cost table for your organization."
226+
echo -e ""
210227
echo -e "Return values:"
211228
echo -e " 0 SUCCESS"
212229
echo -e " ANYTHING_BUT_ZERO FAILURE/ERROR"
@@ -1471,6 +1488,148 @@ function fetch-all()
14711488
return 0
14721489
}
14731490

1491+
# Shared pre-flight for the cost commands: checks the required tooling and
# configuration, confirms that the workflow finished at least 8 hours ago, and
# derives the parameters of the Big Query cost query.
#
# Arguments:
#   $1 - cromwell workflow ID
#   $2 - URL of the cromwell server that ran the workflow
# Globals read:
#   BQ_COST_TABLE_FILE, CROMWELL_SUBMISSIONS_FILE
# Globals set (used by cost / cost-detailed):
#   COST_TABLE    - first line of ${BQ_COST_TABLE_FILE}
#   STARTED_TIME  - earliest "PickedUp" workflow processing event timestamp
#   FINISHED_TIME - latest "Finished" workflow processing event timestamp
#   START_DATE    - the day before STARTED_TIME, as YYYY-MM-DD
#   END_DATE      - the day after FINISHED_TIME, as YYYY-MM-DD
# Exits (terminating the whole script):
#   8  - bq is not installed
#   13 - running on Darwin without gdate
#   9  - cost table config file is missing
#   10 - workflow ID is not in the submissions file
#   15 - no metadata could be fetched from the server
#   11 - workflow has not finished
#   16 - workflow finish time could not be parsed
#   12 - workflow finished less than 8 hours ago
function _cost_helper()
{
  local id=$1
  local svr=$2

  turtle

  if ! command -v bq &>/dev/null ; then
    error "bq does not exist. Must install the big query command-line client."
    exit 8
  fi

  # The date arithmetic below uses GNU date's -d option, so on Darwin the
  # coreutils 'gdate' is required:
  local date_cmd=date
  if [[ "$(uname)" == "Darwin" ]] ; then
    if ! command -v gdate &>/dev/null ; then
      error "Must have coreutils installed for 'gdate'"
      exit 13
    fi
    date_cmd=gdate
  fi

  if [[ ! -e "${BQ_COST_TABLE_FILE}" ]] ; then
    error "Big Query cost table file does not exist. Must populate ${BQ_COST_TABLE_FILE} with big query cost table information."
    exit 9
  fi

  # Make sure the given ID is actually in our file.
  # -F: the ID is a literal string, not a regular expression.
  if ! grep -qF -- "${id}" "${CROMWELL_SUBMISSIONS_FILE}" ; then
    error "Given ID is not in your cromwell submissions file (${CROMWELL_SUBMISSIONS_FILE}): ${id}"
    exit 10
  fi

  COST_TABLE=$( head -n1 "${BQ_COST_TABLE_FILE}" )

  # Get the time that the workflow finished:
  error "Fetching workflow finish time..."
  local tmp_metadata
  tmp_metadata=$( makeTemp )
  curl --compressed -s "${svr}/api/workflows/v1/${id}/metadata?includeKey=workflowProcessingEvents" > "${tmp_metadata}"

  if [[ ! -s "${tmp_metadata}" ]] ; then
    error "Could not communicate with server. Perhaps try a longer timeout."
    exit 15
  fi

  if ! grep -q '"description":"Finished",' "${tmp_metadata}" ; then
    error "Workflow ${id} is not finished yet."
    exit 11
  fi

  # A workflow can carry more than one event of each kind, so reduce each list
  # to a single timestamp.  min/max compare the timestamp strings, which
  # assumes they share one ISO-8601 format.
  STARTED_TIME=$( jq -r '[.workflowProcessingEvents[] | select(.description == "PickedUp") | .timestamp] | min // empty' "${tmp_metadata}" )
  FINISHED_TIME=$( jq -r '[.workflowProcessingEvents[] | select(.description == "Finished") | .timestamp] | max // empty' "${tmp_metadata}" )

  # Make sure that at least 8h have passed since the workflow finished:
  local finished_epoch now_epoch
  finished_epoch=$( ${date_cmd} +%s -d "${FINISHED_TIME}" 2>/dev/null )
  now_epoch=$( date +%s )
  if [[ ! "${finished_epoch}" =~ ^[0-9]+$ ]] ; then
    # Without this guard an unparsable time would silently skip the 8h check.
    error "Could not parse the finish time of workflow ${id}: ${FINISHED_TIME}"
    exit 16
  fi
  if (( now_epoch - finished_epoch < 3600 * 8 )) ; then
    error "Workflow finished less than 8 hours ago. Cannot check cost. Please try again later."
    exit 12
  fi

  # Generate the start and end dates for our query, padded by one day on
  # either side of the workflow's run:
  START_DATE=$( ${date_cmd} +%Y-%m-%d -d "${STARTED_TIME} -1 day" )
  END_DATE=$( ${date_cmd} +%Y-%m-%d -d "${FINISHED_TIME} +1 day" )

  error "Using cost table: ${COST_TABLE}"
  error ""
}
1550+
1551+
# Get the cost for a workflow ID.
#
# Arguments:
#   $1 - cromwell workflow ID
#   $2 - URL of the cromwell server that ran the workflow
# Globals read (set by _cost_helper):
#   COST_TABLE, START_DATE, END_DATE
# Outputs:
#   The total cost, prefixed with '$', on stdout; notes go through error().
# Returns:
#   0 on success, or the non-zero exit status of the bq query.
# Exits:
#   14 when the query produced no cost entries (plus the _cost_helper exits).
function cost()
{
  local id=$1
  local svr=$2
  _cost_helper "${id}" "${svr}"

  local tmp_cost_file
  tmp_cost_file=$( makeTemp )

  # Get the cost from Big Query:
  bq query --use_legacy_sql=false "SELECT sum(cost) FROM \`${COST_TABLE}\`, UNNEST(labels) WHERE value = \"cromwell-${id}\" AND _PARTITIONDATE BETWEEN \"${START_DATE}\" AND \"${END_DATE}\";" > "${tmp_cost_file}"
  local r=$?
  if [[ ${r} -ne 0 ]] ; then
    # Do not try to parse the output of a failed query.
    error "Big Query cost query failed (exit status ${r})."
    return "${r}"
  fi

  # Display the cost.  The value is taken from the 4th line of the bq output,
  # which is expected to be the first data row of the result table.
  local total_cost
  total_cost=$( head -n4 "${tmp_cost_file}" | tail -n1 | tr -d '| \t' )
  if [[ -z "${total_cost}" || "${total_cost}" == NULL ]] ; then
    error "Could not retrieve cost - no cost entries found."
    exit 14
  fi
  echo -n '$'
  echo "scale=2;${total_cost}/1" | bc

  error ""
  error "Costs rounded to nearest cent (approximately)."
  error ""
  error "WARNING: Costs here DO NOT include any call cached tasks."

  return 0
}
1577+
1578+
# Get the cost for a workflow ID, broken down by task.
#
# Arguments:
#   $1 - cromwell workflow ID
#   $2 - URL of the cromwell server that ran the workflow
# Globals read (set by _cost_helper):
#   COST_TABLE, START_DATE, END_DATE
# Outputs:
#   A TASK/COST table followed by the total cost on stdout; notes go through
#   error().
# Returns:
#   0 on success, or the non-zero exit status of the bq query.
# Exits:
#   14 when the query produced no cost entries (plus the _cost_helper exits).
function cost-detailed()
{
  local id=$1
  local svr=$2
  _cost_helper "${id}" "${svr}"

  local tmp_cost_file
  tmp_cost_file=$( makeTemp )

  # Get the cost from Big Query.  The post-processing drops the table header
  # and border lines and keeps the last two columns (task name, cost) of each
  # row.
  bq query \
    --use_legacy_sql=false \
    "SELECT
       wfid.value, service.description, task.value as task_name, sum(cost) as cost
     FROM
       \`${COST_TABLE}\` as billing, UNNEST(labels) as wfid, UNNEST(labels) as task
     WHERE
       cost > 0
       AND task.key LIKE \"wdl-task-name\"
       AND wfid.key LIKE \"cromwell-workflow-id\"
       AND wfid.value like \"%${id}\"
       AND _PARTITIONDATE BETWEEN \"${START_DATE}\" AND \"${END_DATE}\"
     GROUP BY 1,2,3
     ORDER BY 4 DESC
     ;" \
    | tail -n+4 \
    | grep -v '^+' \
    | tr -d '|' \
    | awk 'BEGIN{OFS="\t"} NF >= 2 {print $(NF-1), $NF}' \
    | sort > "${tmp_cost_file}"
  # $? would be the status of 'sort'; the status that matters is bq's.
  local r=${PIPESTATUS[0]}
  if [[ ${r} -ne 0 ]] ; then
    error "Big Query cost query failed (exit status ${r})."
    return "${r}"
  fi

  if [[ ! -s "${tmp_cost_file}" ]] ; then
    error "Could not retrieve cost - no cost entries found."
    exit 14
  fi

  # Sum the per-task costs, then cut the total down to two decimal places:
  local total_cost
  total_cost=$( awk '{print $2}' "${tmp_cost_file}" | tr '\n' '+' | sed 's#$#0#' | bc )
  total_cost=$( echo "scale=2;${total_cost}/1" | bc )

  local tmpf2
  tmpf2=$( makeTemp )
  echo -e "TASK\tCOST" > "${tmpf2}"
  local task task_cost
  while read -r task task_cost ; do
    # Any task costing less than one cent is reported as one cent.
    task_cost=$( echo "scale=2;if ( ${task_cost} >= 0.01 ) { ${task_cost}/1; } else { 0.01 }" | bc )
    # The task name is passed as an argument, never as part of the format.
    printf '%s\t$%02.2f\n' "${task}" "${task_cost}"
  done < "${tmp_cost_file}" >> "${tmpf2}"

  # Render the table once; the bar is one character wider than the header.
  local table header bar
  table=$( column -t "${tmpf2}" )
  header=${table%%$'\n'*}
  printf -v bar '%*s' $(( ${#header} + 1 )) ''
  bar=${bar// /=}

  echo "${header}"
  echo "${bar}"
  tail -n+2 <<< "${table}"
  echo "${bar}"
  echo "Total Cost: \$${total_cost}"

  error ""
  error "Costs rounded to nearest cent (approximately)."
  error ""
  error "WARNING: Costs here DO NOT include any call cached tasks."

  return 0
}
1632+
14741633
function assertValidEmail()
14751634
{
14761635
# Make sure the user gave us a good email address:
@@ -1885,7 +2044,7 @@ if ${ISINTERACTIVESHELL} ; then
18852044

18862045
# Validate our sub-command:
18872046
case ${SUB_COMMAND} in
1888-
cleanup|submit|status|logs|execution-status-count|counts|metadata|slim-metadata|timing|abort|notify|list|fetch-all|fetch-logs|list-outputs|alias)
2047+
cleanup|submit|status|cost|cost-detailed|logs|execution-status-count|counts|metadata|slim-metadata|timing|abort|notify|list|fetch-all|fetch-logs|list-outputs|alias)
18892048
# This is a good sub-command, so we do not need to do anything.
18902049
;;
18912050
_rawNotify)

0 commit comments

Comments
 (0)