New metric definitions for llama-3-3-70b as judge in Arena Hard benchmark #7451

Workflow file for this run

name: Test Library Code

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

# Cancel any in-progress run for the same pull request or branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
  cancel-in-progress: true

jobs:
  unittests:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    env:
      OS: ubuntu-latest
      # Silence library logging and progress bars so test output stays readable.
      UNITXT_DEFAULT_VERBOSITY: error
      DATASETS_VERBOSITY: error
      HF_HUB_VERBOSITY: error
      HF_DATASETS_DISABLE_PROGRESS_BARS: "True"
      HF_HUB_DOWNLOAD_TIMEOUT: 60
      HF_HUB_ETAG_TIMEOUT: 60
      TQDM_DISABLE: "True"
    steps:
      - uses: actions/checkout@v5
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'
      - name: Install Dependencies
        run: |
          bash utils/install.sh
      # Repository-local composite action that installs internal packages over SSH.
      - uses: ./.github/actions/install-internal-pip
        with:
          ssh-private-key: ${{ secrets.LLMEVALKIT_SSH_KEY }}
      - name: Run Tests
        run: coverage run -m unittest discover -s tests/library -p "test_*.py"
      - run: coverage report
      - name: Upload Coverage to Coveralls
        uses: coverallsapp/github-action@v2
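
For local debugging, the install and test steps can be reproduced with the same commands the workflow runs. A minimal sketch, assuming the repository root as the working directory and that utils/install.sh installs the coverage package (the Coveralls upload step is CI-only and is not reproduced here):

  # Install dependencies the same way CI does
  bash utils/install.sh

  # Run the library unit tests under coverage, then print the report
  coverage run -m unittest discover -s tests/library -p "test_*.py"
  coverage report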