Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding the dataset utilities for compiling files in parallel #36

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
12 changes: 6 additions & 6 deletions llvm_ir_dataset_utils/compile_time_analysis_tools/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@ SHELL := /bin/bash
WILD := $(shell echo {$(begin)..$(end)})

ifneq ($(CC), clang)
$(warning WARNING: SETTING CC TO clang OR clang++)
override CC := clang
ifeq ($(lang), cpp)
override CC := clang++
endif
$(warning WARNING: SETTING CC TO clang OR clang++)
override CC := clang
ifeq ($(lang), cpp)
override CC := clang++
endif
endif

all: $(WILD)

$(WILD):
@perf stat --no-big-num -e instructions:u -o \
$(lang)/perf_stat_files/[email protected] \
$(CC) -O3 -c $(lang)/bc_files/[email protected] \
$(CC) -O3 -c $(lang)/bc_files/[email protected] \
-o $(lang)/object_files/[email protected]
@instruct=$$(awk '/instructions/ {print $$1}' \
$(lang)/perf_stat_files/[email protected]); \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,41 @@
#!/bin/bash
set -o errexit

#USAGE
#./create_batch_files.sh <STORAGE_PATH> <MAKEFILE_PATH>

if [ -z "$1" ]; then
STORAGE="/lustre/schandra_crpl/users/3302/ir_bc_files/"
STORAGE="/tmp"
else
STORAGE="$1"
fi
if [ -z "$2" ]; then
MAKE_PATH="/home/3302/hf_py_code/compile/codes/batch_jobs/makefile_dir/"
MAKE_PATH=".."
else
MAKE_PATH="$2"
fi

lang=("c" "cpp" "julia" "rust" "swift")
array1=(0 31653 87225 144641 353700)
sizes=(31653 55572 57416 209059 49051)
lang=()
start_ids=()
sizes=()

while IFS=',' read -r language start_index end_index; do
lang+=($language)
start_ids+=($start_index)
sizes+=($((${end_index}-${start_index})))
done < <(tail -n +2 "../dataset_download/indices.csv")

length=${#lang[@]}

for (( i=0; i<$length; i++ ))
do
js="${lang[$i]}_batch.sh"
cp job_template.sh $js
echo "#SBATCH --output=${STORAGE}${lang[$i]}/job_results/slurm-%A_%a.out" >> $js
echo "#SBATCH --error=${STORAGE}${lang[$i]}/job_results/slurm-%A_%a.out" >> $js
echo "START=${array1[$i]}" >> $js
echo "#SBATCH --output=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> $js

echo "#SBATCH --error=${STORAGE}/${lang[$i]}/job_results/slurm-%A_%a.out" >> $js

echo "START=${start_ids[$i]}" >> $js
echo "TYPE=${lang[$i]}" >> $js
echo "SIZE=${sizes[$i]}" >> $js
echo "STORAGE=${STORAGE}" >> $js
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
#SBATCH --partition=standard
#SBATCH --time=0-00:10:00
#SBATCH --export=NONE
# NUMBER OF JOBS: 400
#SBATCH --array=0-399

Original file line number Diff line number Diff line change
@@ -1,34 +1,44 @@
#!/bin/bash
PREFIX=/lustre/schandra_crpl/users/3302/ir_bc_files/
set -o errexit
#Usage:
#./combine_outputs.sh <language> [storage]

language=($1)
if [ ${#language[@]} -eq 0 ]; then
echo "Missing language argument."
if [ -z "$1" ]; then
echo "Missing language argument."
exit 1
else
LANGUAGE="$1"
fi

cd ${PREFIX}
for dir in "${language[@]}"; do
echo "file, text_segment_size" \
> ${dir}/results/${dir}_text_segments.csv
echo "file, instructions" \
> ${dir}/results/${dir}_instructions.csv
for ps in ${dir}/ps_*; do
cat ${ps}/text_segments.csv \
>> ${dir}/results/${dir}_text_segments.csv
cat ${ps}/instructions.csv \
>> ${dir}/results/${dir}_instructions.csv
done
sort -nk1.5 ${dir}/results/${dir}_text_segments.csv \
-o ${dir}/results/${dir}_text_segments.csv
sort -nk1.5 ${dir}/results/${dir}_instructions.csv \
-o ${dir}/results/${dir}_instructions.csv
awk -F, 'NR==FNR{a[NR]=$1","$2; next} {print a[FNR], $2}' \
OFS=, ${dir}/results/${dir}_text_segments.csv \
${dir}/results/${dir}_instructions.csv \
> ${dir}/results/${dir}_combined_results.csv
sed -n -i '/, ,/!p' ${dir}/results/${dir}_combined_results.csv
rm ${dir}/results/${dir}_instructions.csv \
${dir}/results/${dir}_text_segments.csv
rm -r ${dir}/ps_*
if [ -z "$2" ]; then
STORAGE="/tmp"
else
STORAGE="$2"
fi


cd ${STORAGE}

echo "file, text_segment_size" \
> ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
echo "file, instructions" \
> ${LANGUAGE}/results/${LANGUAGE}_instructions.csv
for ps in ${LANGUAGE}/ps_*; do
cat ${ps}/text_segments.csv \
>> ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
cat ${ps}/instructions.csv \
>> ${LANGUAGE}/results/${LANGUAGE}_instructions.csv
done
sort -nk1.5 ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv \
-o ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
sort -nk1.5 ${LANGUAGE}/results/${LANGUAGE}_instructions.csv \
-o ${LANGUAGE}/results/${LANGUAGE}_instructions.csv
awk -F, 'NR==FNR{a[NR]=$1","$2; next} {print a[FNR], $2}' \
OFS=, ${LANGUAGE}/results/${LANGUAGE}_text_segments.csv \
${LANGUAGE}/results/${LANGUAGE}_instructions.csv \
> ${LANGUAGE}/results/${LANGUAGE}_combined_results.csv
sed -n -i '/, ,/!p' ${LANGUAGE}/results/${LANGUAGE}_combined_results.csv
rm ${LANGUAGE}/results/${LANGUAGE}_instructions.csv \
${LANGUAGE}/results/${LANGUAGE}_text_segments.csv
rm -r ${LANGUAGE}/ps_*

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
set -o errexit
#Usage:
#./create_tar.sh <language> [storage]

if [ -z "$1" ]; then
echo "Missing language argument."
exit 1
else
LANGUAGE="$1"
fi

if [ -z "$2" ]; then
STORAGE="/tmp"
else
STORAGE="$2"
fi

cd ${STORAGE}/${LANGUAGE}

for dir in [0-9]*_temp; do
cd $dir
tar --append --file="../${LANGUAGE}_bc_files.tar" \
--transform=s,^,bc_files/, file[0-9]*.bc
cd ..

rm -r "${dir}"
done

Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# -*- coding: ascii -*-
from datasets import load_dataset, parallel
import os
import multiprocessing
import csv
from sys import argv

# Usage:
# python write_data_files.py [STORAGE]

STORAGE: str
if len(argv) > 1:
STORAGE = argv[1]
else:
STORAGE = '/tmp'

lang_list: [str]
global j
global dir_name
j: int
dir_name: str
BATCH_SIZE: int = 15000
file_indices: [dict] = []


def write_file(index: [int], bytes_item: [bytes]):
filename = f'{dir_name}/file{index+j+1}.bc'
with open(filename, 'wb') as file:
file.write(bytes_item)


with parallel.parallel_backend('spark'):
dataset = load_dataset('llvm-ml/ComPile', split='train', num_proc=2)

lang_list = dataset["language"]
langs = dataset.unique("language")
pool = multiprocessing.pool.ThreadPool(processes=multiprocessing.cpu_count())

for i in range(0, len(langs)):
start_index = lang_list.index(langs[i])
if (i+1 != len(langs)):
end_index = lang_list.index(langs[i+1])
else:
end_index = len(lang_list)
file_indices.append(
{"language": langs[i], "start_index": start_index, "end_index": end_index})
for j in range(start_index, end_index, BATCH_SIZE):
dir_name = os.path.join(STORAGE, f'{STORAGE}/{langs[i]}/{j}_temp')
os.makedirs(dir_name, exist_ok=True)
bytes_enumeration = enumerate(
dataset[j:j+BATCH_SIZE if (j+BATCH_SIZE <= end_index) else end_index]['content'])
pool.starmap(write_file, bytes_enumeration)

pool.close()

with open('indices.csv', mode='w', newline='') as file:
writer = csv.DictWriter(file, fieldnames=[
"language", "start_index", "end_index"], dialect='unix', quoting=csv.QUOTE_NONE)
writer.writeheader()
writer.writerows(file_indices)
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,34 @@
import numpy as np
import matplotlib.pyplot as plt
from read_column import open_and_load
# Usage:
# from fitting_and_plotting import plot_functionality

def plot_functionality(lang: str, show: bool=False)->None:
textseg_data, inst_data = open_and_load(lang)
c, b, a = np.polyfit(textseg_data, inst_data, 2)
# while(c <= 0):
# index = textseg_data.index(max(textseg_data))
# del textseg_data[index]
# del inst_data[index]
# c, b, a = np.polyfit(textseg_data, inst_data, 2)

x_axis = range(min(textseg_data), max(textseg_data), 10)
z = np.polyval([c,b,a], x_axis)

plt.scatter(textseg_data,inst_data)
plt.xscale("log")
plt.yscale("log")
# plt.gca().set_ylim([min(inst_data), 10**13])
plt.gca().set_ylim([10**8, 10**13])
plt.xlabel("text_segment_size (bytes)")
plt.ylabel("compiler_cpu_instructions_count")
if (lang == "cpp"):
plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")")
else:
plt.title("Clang Compiler Instructions vs. Text Segment Size ("+lang+")")
plt.plot(x_axis,z, 'r')
equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
plt.legend([f"fit: {equation}", "original"])
if (show):
plt.show()
else:
plt.savefig(fname=lang+"_instvtext.pdf", format="pdf")
def plot_functionality(lang: str, show: bool = False) -> None:
'''
Function to graph csv data for text segment size and instructions counts.
'''
textseg_data, inst_data = open_and_load(lang)
c, b, a = np.polyfit(textseg_data, inst_data, 2)

x_axis = range(min(textseg_data), max(textseg_data), 10)
z = np.polyval([c, b, a], x_axis)

plt.scatter(textseg_data, inst_data)
plt.xscale("log")
plt.yscale("log")
plt.gca().set_ylim([10**8, 10**13])
plt.xlabel("text_segment_size (bytes)")
plt.ylabel("compiler_cpu_instructions_count")
if (lang == "cpp"):
plt.title("Clang++ Compiler Instructions vs. Text Segment Size ("+lang+")")
else:
plt.title("Clang Compiler Instructions vs. Text Segment Size ("+lang+")")
plt.plot(x_axis, z, 'r')
equation = f"${c:.1e}x^2 + {b:.1e}x + {a:.1e}$"
plt.legend([f"fit: {equation}", "original"])
if (show):
plt.show()
else:
plt.savefig(fname=lang+"_instvtext.pdf", format="pdf")
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
# -*- coding: ascii -*-
import csv

def open_and_load(lang: str)->[int]:
# PREFIX:str="/home/3302/hf_py_code/compile/codes/results/"
#PREFIX:str="/lustre/schandra_crpl/users/3302/ir_bc_files/"+lang+"/results/"
PREFIX:str="/home/3302/hf_py_code/compile/csv_data/inst_scatterplots/"
textseg_data: [int] = []
inst_data: [int] = []
# with open(PREFIX+lang+"_text_segments.csv", mode='r', newline='') as file:
with open(PREFIX+lang+"_combined_results.csv", mode='r', newline='') as file:
for x in csv.DictReader(file):
textseg_data.append(int(x[" text_segment_size"]))

inst_data.append(int(x[" instructions"]))
# with open(PREFIX+lang+"_instructions.csv", mode='r', newline='') as file:
# for x in csv.DictReader(file):
return textseg_data, inst_data

def open_and_load(lang: str, STORAGE: str = '/tmp') -> [int]:
'''
Function to read csv files containing text segment size and instruction counts data.
'''
textseg_data: [int] = []
inst_data: [int] = []
with open(STORAGE+lang+"_combined_results.csv", mode='r', newline='') as file:
for x in csv.DictReader(file):
textseg_data.append(int(x[" text_segment_size"]))

inst_data.append(int(x[" instructions"]))
return textseg_data, inst_data