Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding the dataset utilities for compiling files in parallel #36

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ llvm_ir_dataset_utils.egg-info/
*.tar
*.sif
*.swp
*~
25 changes: 25 additions & 0 deletions llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
SHELL := /bin/bash

# Compile a contiguous range of numbered bitcode files and record, for each
# one, the user-space instruction count of the compilation (via `perf stat`)
# and the first column of `llvm-size` for the resulting object file.
#
# Required command-line variables:
#   lang   directory containing bc_files/, object_files/, perf_stat_files/,
#          instruction_counts/ and textseg_sizes/; the value "cpp" also
#          selects clang++ when CC has to be overridden
#   begin  first file index (inclusive)
#   end    last file index (inclusive)
#
# Example: make --jobs=24 lang=c begin=1 end=100

ifeq ($(strip $(lang)),)
$(error lang is not set; usage: make lang=<dir> begin=<N> end=<M>)
endif
ifeq ($(strip $(begin)),)
$(error begin is not set; usage: make lang=<dir> begin=<N> end=<M>)
endif
ifeq ($(strip $(end)),)
$(error end is not set; usage: make lang=<dir> begin=<N> end=<M>)
endif

# One target per file index; bash brace expansion produces "begin ... end".
WILD := $(shell echo {$(begin)..$(end)})

# Force a clang driver regardless of what the environment or the command
# line supplied for CC.
ifneq ($(CC), clang)
$(warning WARNING: SETTING CC TO clang OR clang++)
override CC := clang
ifeq ($(lang), cpp)
override CC := clang++
endif
endif

# The targets are bare indices, not files; declare them phony so a stray
# file named e.g. "1" in the working directory cannot suppress a job.
.PHONY: all $(WILD)

all: $(WILD)

# For index N ($@):
#   1. compile bc_files/fileN.bc at -O3 under perf stat, writing the object
#      to object_files/fileN.o and the perf report to perf_stat_files/
#   2. pull the instruction count out of the perf report and append
#      "fileN, <count>" to instruction_counts/instN.csv
#   3. append "fileN, <size>" (first column of the second llvm-size line)
#      to textseg_sizes/textsegN.csv
$(WILD):
	@perf stat --no-big-num -e instructions:u -o \
		$(lang)/perf_stat_files/file$@.txt \
		$(CC) -O3 -c $(lang)/bc_files/file$@.bc \
		-o $(lang)/object_files/file$@.o
	@instruct=$$(awk '/instructions/ {print $$1}' \
		$(lang)/perf_stat_files/file$@.txt); \
	echo "file$@, $$instruct" >> $(lang)/instruction_counts/inst$@.csv
	@size=$$(llvm-size $(lang)/object_files/file$@.o | awk 'NR==2 {print $$1}'); \
	echo "file$@, $$size" >> $(lang)/textseg_sizes/textseg$@.csv

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Body of one Slurm array task: extract this task's slice of bitcode files
# into scratch space, compile them in parallel with make, and copy the
# per-file measurements back to storage as two CSV files.
#
# Expects START, TYPE, SIZE, STORAGE and MAKE_PATH to be assigned earlier in
# the script (presumably by the generator that prepends them -- confirm),
# and SLURM_ARRAY_TASK_ID, SLURM_ARRAY_TASK_MAX and TMPDIR to come from the
# job environment.
THREADS=24

# Refuse to run without a scratch directory: a bare `cd` with an empty
# argument would silently land in $HOME.
: "${TMPDIR:?TMPDIR must be set}"

# STORAGE and MAKE_PATH are used as path prefixes; make sure a non-empty
# value ends in exactly one slash so "/tmp" and "/tmp/" behave the same.
STORAGE="${STORAGE:+${STORAGE%/}/}"
MAKE_PATH="${MAKE_PATH:+${MAKE_PATH%/}/}"

# Every task handles BATCH files; the last task (ID == MAX) handles only the
# remainder SIZE % MAX.
BATCH=$((SIZE / SLURM_ARRAY_TASK_MAX))
I=$((SLURM_ARRAY_TASK_ID * BATCH + 1 + START))
STOP=$((I + BATCH - 1))
if [ "$SLURM_ARRAY_TASK_ID" -eq "$SLURM_ARRAY_TASK_MAX" ]; then
  STOP=$((I + SIZE % SLURM_ARRAY_TASK_MAX - 1))
fi

# An empty slice (remainder of 0, or fewer files than tasks) gives
# STOP == I - 1.  A descending range would otherwise be processed as two
# bogus indices, so stop here instead.
if [ "$STOP" -lt "$I" ]; then
  echo "Task ${SLURM_ARRAY_TASK_ID}: no files assigned, nothing to do."
  exit 0
fi

# Per-index path lists for tar and for the final concatenation.
bc_members=()
textseg_parts=()
inst_parts=()
for ((n = I; n <= STOP; n++)); do
  bc_members+=("bc_files/file${n}.bc")
  textseg_parts+=("${TYPE}/textseg_sizes/textseg${n}.csv")
  inst_parts+=("${TYPE}/instruction_counts/inst${n}.csv")
done

WORK_DIR="${TMPDIR}/ir_bc_files/ps_${I}"
mkdir -p "${WORK_DIR}/${TYPE}" || exit 1
cd "${WORK_DIR}/${TYPE}" || exit 1
mkdir -p bc_files instruction_counts perf_stat_files \
  textseg_sizes object_files
tar --extract --file="${STORAGE}${TYPE}/${TYPE}_bc_files.tar" \
  "${bc_members[@]}"

# make is run from the parent directory; the Makefile addresses files as
# $(lang)/<subdir>/..., and --ignore-errors keeps going past failed files.
cd "${WORK_DIR}" || exit 1
make --ignore-errors --makefile="${MAKE_PATH}Makefile" \
  --jobs="${THREADS}" lang="${TYPE}" begin="$I" end="$STOP"

RESULT_DIR="${STORAGE}${TYPE}/ps_${I}"
mkdir -p "${RESULT_DIR}"
cat "${textseg_parts[@]}" > "${RESULT_DIR}/text_segments.csv"
cat "${inst_parts[@]}" > "${RESULT_DIR}/instructions.csv"

# Leave the scratch directory before deleting it.
cd "${TMPDIR}/ir_bc_files" || exit 1
rm -r "ps_${I}"

Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
set -o errexit

#USAGE
#./create_batch_files.sh <STORAGE_PATH> <MAKEFILE_PATH>
#
# Generates one "<language>_batch.sh" Slurm script per row of
# ../dataset_download/indices.csv.  Each script is job_template.sh, followed
# by per-language #SBATCH output/error directives and variable assignments,
# followed by batch_main_body.sh.
#
#   STORAGE_PATH   directory holding the per-language data (default: /tmp)
#   MAKEFILE_PATH  directory containing the Makefile (default: ..)

STORAGE="${1:-/tmp}"
MAKE_PATH="${2:-..}"

# The job changes directory before invoking make, so a path relative to
# this script's working directory would no longer resolve there.  Make it
# absolute whenever it can be resolved now.
if [ -d "$MAKE_PATH" ]; then
  MAKE_PATH="$(cd "$MAKE_PATH" && pwd)"
fi

# Strip any trailing slash; exactly one is added back where the paths are
# written out, because the job body appends names directly to these values.
STORAGE="${STORAGE%/}"
MAKE_PATH="${MAKE_PATH%/}"

lang=()
start_ids=()
sizes=()

# Skip the CSV header; "|| [ -n ... ]" keeps a final row that has no
# trailing newline.
while IFS=',' read -r language start_index end_index || [ -n "$language" ]; do
  [ -n "$language" ] || continue
  lang+=("$language")
  start_ids+=("$start_index")
  sizes+=("$((end_index - start_index))")
done < <(tail -n +2 "../dataset_download/indices.csv")

length=${#lang[@]}

for (( i=0; i<length; i++ ))
do
  name="${lang[$i]}"
  js="${name}_batch.sh"
  cp job_template.sh "$js"
  {
    # The #SBATCH lines must come before the first executable statement.
    echo "#SBATCH --output=${STORAGE}/${name}/job_results/slurm-%A_%a.out"
    echo "#SBATCH --error=${STORAGE}/${name}/job_results/slurm-%A_%a.out"
    # %q shell-quotes the values so unusual characters survive re-parsing.
    printf 'START=%q\n' "${start_ids[$i]}"
    printf 'TYPE=%q\n' "${name}"
    printf 'SIZE=%q\n' "${sizes[$i]}"
    printf 'STORAGE=%q\n' "${STORAGE}/"
    printf 'MAKE_PATH=%q\n' "${MAKE_PATH}/"
    cat batch_main_body.sh
  } >> "$js"
  chmod 744 "$js"
done

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash -l
#
# Slurm directives shared by the array jobs; no executable statements here.
# One task on one node with 12 CPUs.
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=12
#SBATCH --job-name=compiler_batch
#SBATCH --partition=standard
# Wall-clock limit per array task: 10 minutes (format D-HH:MM:SS).
#SBATCH --time=0-00:10:00
# Do not propagate the submitting shell's environment into the job.
#SBATCH --export=NONE
# NUMBER OF JOBS: 400
# Array task IDs run from 0 to 399 inclusive.
#SBATCH --array=0-399

Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
set -o errexit
#Usage:
#./combine_outputs.sh <language> [storage]
#
# Merges every <storage>/<language>/ps_*/ result directory into
# <storage>/<language>/results/<language>_combined_results.csv with one row
# per file ("file, text_segment_size, instructions"), then removes the
# ps_* directories.  storage defaults to /tmp.

if [ -z "$1" ]; then
  echo "Missing language argument." >&2
  exit 1
fi
LANGUAGE="$1"
STORAGE="${2:-/tmp}"

cd "${STORAGE}"

RESULTS_DIR="${LANGUAGE}/results"
TEXT_CSV="${RESULTS_DIR}/${LANGUAGE}_text_segments.csv"
INST_CSV="${RESULTS_DIR}/${LANGUAGE}_instructions.csv"
COMBINED_CSV="${RESULTS_DIR}/${LANGUAGE}_combined_results.csv"

# An unmatched glob stays as the literal pattern, so test the first entry.
ps_dirs=("${LANGUAGE}"/ps_*)
if [ ! -d "${ps_dirs[0]}" ]; then
  echo "No ${LANGUAGE}/ps_* directories found in ${STORAGE}." >&2
  exit 1
fi

# The redirects below need the results directory to exist.
mkdir -p "${RESULTS_DIR}"

echo "file, text_segment_size" > "${TEXT_CSV}"
echo "file, instructions" > "${INST_CSV}"
for ps in "${ps_dirs[@]}"; do
  cat "${ps}/text_segments.csv" >> "${TEXT_CSV}"
  cat "${ps}/instructions.csv" >> "${INST_CSV}"
done

# Numeric sort on the digits that follow the 4-character "file" prefix; the
# header has no digits there, evaluates to 0 and so stays on the first line.
sort -nk1.5 "${TEXT_CSV}" -o "${TEXT_CSV}"
sort -nk1.5 "${INST_CSV}" -o "${INST_CSV}"

# Join the two sorted files row by row: keep both columns of the
# text-segment file and append the second column of the instruction file.
awk -F, 'NR==FNR{a[NR]=$1","$2; next} {print a[FNR], $2}' \
  OFS=, "${TEXT_CSV}" "${INST_CSV}" \
  > "${COMBINED_CSV}"

# Drop rows whose text-segment size is empty.
sed -n -i '/, ,/!p' "${COMBINED_CSV}"

rm "${INST_CSV}" "${TEXT_CSV}"
rm -r "${ps_dirs[@]}"