Adding the dataset utilities for compiling files in parallel #36

Open
wants to merge 9 commits into base: main
25 changes: 25 additions & 0 deletions llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile
@@ -0,0 +1,25 @@
SHELL := /bin/bash

WILD := $(shell echo {$(begin)..$(end)})

ifneq ($(CC), clang)
$(warning WARNING: SETTING CC TO clang OR clang++)
override CC := clang
ifeq ($(lang), cpp)
override CC := clang++
endif
endif

all: $(WILD)

$(WILD):
	@perf stat --no-big-num -e instructions:u -o \
	$(lang)/perf_stat_files/[email protected] \
	$(CC) -O3 -c $(lang)/bc_files/[email protected] \
	-o $(lang)/object_files/[email protected]
	@instruct=$$(awk '/instructions/ {print $$1}' \
	$(lang)/perf_stat_files/[email protected]); \
	echo "file$@, $$instruct" >> $(lang)/instruction_counts/[email protected]
	@size=$$(llvm-size $(lang)/object_files/[email protected] | awk 'NR==2 {print $$1}'); \
	echo "file$@, $$size" >> $(lang)/textseg_sizes/[email protected]

@@ -0,0 +1,29 @@
THREADS=24
BATCH=$(($SIZE/$SLURM_ARRAY_TASK_MAX))
I=$((${SLURM_ARRAY_TASK_ID}*${BATCH}+1+${START}))
STOP=$(($I+${BATCH}-1))
if [ $SLURM_ARRAY_TASK_ID -eq $SLURM_ARRAY_TASK_MAX ]; then
STOP=$(($I+${SIZE}%$SLURM_ARRAY_TASK_MAX-1))
fi
cd $TMPDIR
mkdir -p ir_bc_files/ps_$I/${TYPE}
cd ir_bc_files/ps_$I/${TYPE}
mkdir -p bc_files instruction_counts perf_stat_files \
textseg_sizes object_files
eval tar --extract --file=${STORAGE}${TYPE}/${TYPE}_bc_files.tar \
bc_files/file{$I..$STOP}.bc
cd $TMPDIR/ir_bc_files/ps_$I
make --ignore-errors --makefile=${MAKE_PATH}Makefile \
--jobs=${THREADS} lang="${TYPE}" begin="$I" end="$STOP"
mkdir -p ${STORAGE}${TYPE}/ps_$I
> ${STORAGE}${TYPE}/ps_$I/text_segments.csv

> ${STORAGE}${TYPE}/ps_$I/instructions.csv

eval cat ${TYPE}/textseg_sizes/textseg{$I..$STOP}.csv \
>> ${STORAGE}${TYPE}/ps_$I/text_segments.csv
eval cat ${TYPE}/instruction_counts/inst{$I..$STOP}.csv \
>> ${STORAGE}${TYPE}/ps_$I/instructions.csv
cd ..
rm -r ps_$I
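
This file has no shebang on purpose: create_batch_files.sh appends it to a generated job script after the per-language variables (START, TYPE, SIZE, STORAGE, MAKE_PATH) have been written, and the final array task additionally picks up the remainder SIZE % SLURM_ARRAY_TASK_MAX. A worked example of the index arithmetic, with illustrative numbers:

SIZE=4000; SLURM_ARRAY_TASK_MAX=399; SLURM_ARRAY_TASK_ID=5; START=0
BATCH=$((SIZE / SLURM_ARRAY_TASK_MAX))          # 4000/399 -> 10 files per task
I=$((SLURM_ARRAY_TASK_ID * BATCH + 1 + START))  # first index handled: 51
STOP=$((I + BATCH - 1))                         # last index handled: 60
echo "task ${SLURM_ARRAY_TASK_ID}: file${I}.bc .. file${STOP}.bc"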

@@ -0,0 +1,46 @@
#!/bin/bash
set -o errexit

#Usage:
#./create_batch_files.sh [STORAGE_PATH] [MAKEFILE_PATH]

if [ -z "$1" ]; then
STORAGE="/tmp"
else
STORAGE="$1"
fi
if [ -z "$2" ]; then
MAKE_PATH=".."
else
MAKE_PATH="$2"
fi

lang=()
start_ids=()
sizes=()

while IFS=',' read -r language start_index end_index; do
lang+=($language)
start_ids+=($start_index)
sizes+=($((${end_index}-${start_index})))
done < <(tail -n +2 "../dataset_download/indices.csv")

length=${#lang[@]}

for (( i=0; i<$length; i++ ))
do
js="${lang[$i]}_batch.sh"
cp job_template.sh $js
echo "#SBATCH --output=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> $js

echo "#SBATCH --error=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> $js

echo "START=${start_ids[$i]}" >> $js
echo "TYPE=${lang[$i]}" >> $js
echo "SIZE=${sizes[$i]}" >> $js
echo "STORAGE=${STORAGE}" >> $js
echo "MAKE_PATH=${MAKE_PATH}" >> $js
cat batch_main_body.sh >> $js
chmod 744 $js
done
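
A hedged end-to-end sketch (the storage and makefile paths are placeholders; indices.csv must already exist from the dataset download step):

# Generate one SLURM batch script per language listed in indices.csv,
# then submit e.g. the C job array.
./create_batch_files.sh /scratch/compile_data/ ../compile_time_analysis_tools/
sbatch c_batch.sh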

@@ -0,0 +1,10 @@
#!/bin/bash -l
#
#SBATCH --nodes=1 --ntasks=1 --cpus-per-task=12
#SBATCH --job-name=compiler_batch
#SBATCH --partition=standard
#SBATCH --time=0-00:10:00
#SBATCH --export=NONE
# NUMBER OF JOBS: 400
#SBATCH --array=0-399
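
For reference, a sketch of the tail of a generated script once create_batch_files.sh has appended its per-language lines to this template (the values and paths below are illustrative, not from the PR):

#SBATCH --output=/scratch/compile_data/c/job_results/slurm-%A_%a.out
#SBATCH --error=/scratch/compile_data/c/job_results/slurm-%A_%a.out
START=0
TYPE=c
SIZE=4000
STORAGE=/scratch/compile_data/
MAKE_PATH=../compile_time_analysis_tools/
# ...followed by the contents of batch_main_body.sh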

@@ -0,0 +1,44 @@
#!/bin/bash
set -o errexit
#Usage:
#./combine_outputs.sh <language> [storage]

if [ -z "$1" ]; then
echo "Missing language argument."
exit 1
else
LANGUAGE="$1"
fi

if [ -z "$2" ]; then
STORAGE="/tmp"
else
STORAGE="$2"
fi


cd ${STORAGE}

echo "file, text_segment_size" \
> ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
echo "file, instructions" \
> ${LANGUAGE}/results/${LANGUAGE}_instructions.csv
for ps in ${LANGUAGE}/ps_*; do
cat ${ps}/text_segments.csv \
>> ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
cat ${ps}/instructions.csv \
>> ${LANGUAGE}/results/${LANGUAGE}_instructions.csv
done
sort -nk1.5 ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv \
-o ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
sort -nk1.5 ${LANGUAGE}/results/${LANGUAGE}_instructions.csv \
-o ${LANGUAGE}/results/${LANGUAGE}_instructions.csv
awk -F, 'NR==FNR{a[NR]=$1","$2; next} {print a[FNR], $2}' \
OFS=, ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv \
${LANGUAGE}/results/${LANGUAGE}_instructions.csv \
> ${LANGUAGE}/results/${LANGUAGE}_combined_results.csv
sed -n -i '/, ,/!p' ${LANGUAGE}/results/${LANGUAGE}_combined_results.csv
rm ${LANGUAGE}/results/${LANGUAGE}_instructions.csv \
${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
rm -r ${LANGUAGE}/ps_*
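
A hedged usage sketch (illustrative paths and values; it assumes ${STORAGE}/<language>/results already exists and that every array task has finished). The awk step joins the two sorted CSVs row by row, and the sed step drops rows where either measurement is missing:

./combine_outputs.sh c /scratch/compile_data
# The merged file pairs each object's text segment size with the compiler's
# instruction count, one row per successfully compiled module, e.g.:
#   file, text_segment_size, instructions
#   file17, 3944, 289164032
head /scratch/compile_data/c/results/c_combined_results.csv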

@@ -0,0 +1,29 @@
#!/bin/bash
set -o errexit
#Usage:
#./create_tar.sh <language> [storage]

if [ -z "$1" ]; then
echo "Missing language argument."
exit 1
else
LANGUAGE="$1"
fi

if [ -z "$2" ]; then
STORAGE="/tmp"
else
STORAGE="$2"
fi

cd ${STORAGE}/${LANGUAGE}

for dir in [0-9]*_temp; do
cd $dir
tar --append --file="../${LANGUAGE}_bc_files.tar" \
--transform=s,^,bc_files/, file[0-9]*.bc
cd ..

rm -r "${dir}"
done
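
A hedged usage sketch (placeholder paths); note the script deletes each <offset>_temp directory after appending its .bc files to the archive:

./create_tar.sh cpp /scratch/compile_data
# Confirm the archive stores everything under the bc_files/ prefix expected
# by batch_main_body.sh:
tar --list --file=/scratch/compile_data/cpp/cpp_bc_files.tar | head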

@@ -0,0 +1,60 @@
# -*- coding: ascii -*-
from datasets import load_dataset, parallel
import os
import multiprocessing
from multiprocessing.pool import ThreadPool
import csv
from sys import argv

# Usage:
# python write_data_files.py [STORAGE]

STORAGE: str
if len(argv) > 1:
    STORAGE = argv[1]
else:
    STORAGE = '/tmp'

lang_list: list[str]
# j and dir_name are module-level globals read by write_file() below.
j: int
dir_name: str
BATCH_SIZE: int = 15000
file_indices: list[dict] = []


def write_file(index: int, bytes_item: bytes):
    filename = f'{dir_name}/file{index+j+1}.bc'
    with open(filename, 'wb') as file:
        file.write(bytes_item)


with parallel.parallel_backend('spark'):
    dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)

lang_list = dataset["language"]
langs = dataset.unique("language")
pool = ThreadPool(processes=multiprocessing.cpu_count())

for i in range(0, len(langs)):
    start_index = lang_list.index(langs[i])
    if (i+1 != len(langs)):
        end_index = lang_list.index(langs[i+1])
    else:
        end_index = len(lang_list)
    file_indices.append(
        {"language": langs[i], "start_index": start_index, "end_index": end_index})
    for j in range(start_index, end_index, BATCH_SIZE):
        dir_name = os.path.join(STORAGE, langs[i], f'{j}_temp')
        os.makedirs(dir_name, exist_ok=True)
        bytes_enumeration = enumerate(
            dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content'])
        pool.starmap(write_file, bytes_enumeration)

pool.close()

with open('indices.csv', mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=[
        "language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE)
    writer.writeheader()
    writer.writerows(file_indices)
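
A hedged usage sketch (placeholder path). The script writes indices.csv to the working directory, and create_batch_files.sh later reads ../dataset_download/indices.csv, so running it from that directory keeps the paths consistent; the 'spark' joblib backend also needs a working Spark setup:

pip install datasets
python write_data_files.py /scratch/compile_data
# Expected layout afterwards (illustrative):
#   /scratch/compile_data/<language>/<offset>_temp/file<N>.bc
#   ./indices.csv  with one language,start_index,end_index row per language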
@@ -0,0 +1,35 @@
# -*- coding: ascii -*-
import numpy as np
import matplotlib.pyplot as plt
from read_column import open_and_load
# Usage:
# from fitting_and_plotting import plot_functionality


def plot_functionality(lang: str, show: bool = False) -> None:
    '''
    Function to graph CSV data for text segment size and instruction counts.
    '''
    textseg_data, inst_data = open_and_load(lang)
    # polyfit returns coefficients highest degree first: c*x^2 + b*x + a.
    c, b, a = np.polyfit(textseg_data, inst_data, 2)

    x_axis = range(min(textseg_data), max(textseg_data), 10)
    z = np.polyval([c, b, a], x_axis)

    plt.scatter(textseg_data, inst_data)
    plt.xscale("log")
    plt.yscale("log")
    plt.gca().set_ylim([10**8, 10**13])
    plt.xlabel("text_segment_size (bytes)")
    plt.ylabel("compiler_cpu_instructions_count")
    if (lang == "cpp"):
        plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")")
    else:
        plt.title("Clang Compiler Instructions vs. Text Segment Size ("+lang+")")
    plt.plot(x_axis, z, 'r')
    equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
    plt.legend([f"fit: {equation}", "original"])
    if (show):
        plt.show()
    else:
        plt.savefig(fname=lang+"_instvtext.pdf", format="pdf")
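
A hedged usage sketch; the one-liner assumes read_column.open_and_load can locate the combined CSV (see its STORAGE argument):

# Renders the scatter plot plus quadratic fit to c_instvtext.pdf.
python -c 'from fitting_and_plotting import plot_functionality; plot_functionality("c")'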
@@ -0,0 +1,16 @@
# -*- coding: ascii -*-
import csv
import os


def open_and_load(lang: str, STORAGE: str = '/tmp') -> tuple[list[int], list[int]]:
    '''
    Function to read the combined CSV containing text segment size and
    instruction count data. STORAGE should point at the directory holding
    <lang>_combined_results.csv (e.g. <storage>/<lang>/results as written
    by combine_outputs.sh).
    '''
    textseg_data: list[int] = []
    inst_data: list[int] = []
    path = os.path.join(STORAGE, f'{lang}_combined_results.csv')
    with open(path, mode='r', newline='') as file:
        for x in csv.DictReader(file):
            # The CSV header is "file, text_segment_size, instructions",
            # so the DictReader keys carry a leading space.
            textseg_data.append(int(x[" text_segment_size"]))
            inst_data.append(int(x[" instructions"]))
    return textseg_data, inst_data
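
A hedged quick check (placeholder path pointing at the results directory written by combine_outputs.sh):

python -c 'from read_column import open_and_load; t, i = open_and_load("c", "/scratch/compile_data/c/results"); print(len(t), len(i))'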