diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c7cbc202e1..e798c1a7c9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -177,6 +177,9 @@ jobs: build-macos: runs-on: macos-14 + strategy: + matrix: + arch: [x86_64, arm64] steps: - name: "Checkout repo" uses: actions/checkout@v4 @@ -236,7 +239,7 @@ jobs: cd build cmake .. ${{ env.BUILD_FLAGS }} \ -DCMAKE_BUILD_TYPE=${{ env.BUILD_MODE }} \ - -DCMAKE_OSX_ARCHITECTURES=x86_64 \ + -DCMAKE_OSX_ARCHITECTURES=${{ matrix.arch }} \ -DMACOS_BUNDLE=ON \ -G Ninja @@ -259,5 +262,5 @@ jobs: - name: Upload artifact uses: actions/upload-artifact@v4 with: - name: cemu-bin-macos-x64 + name: cemu-bin-macos-${{ matrix.arch }} path: ./bin/Cemu.dmg diff --git a/CMakeLists.txt b/CMakeLists.txt index eb848ce71f..c70b0a40dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,7 +222,12 @@ endif() add_subdirectory("dependencies/ih264d" EXCLUDE_FROM_ALL) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(AARCH64)") +if (CMAKE_OSX_ARCHITECTURES) + set(CEMU_ARCHITECTURE ${CMAKE_OSX_ARCHITECTURES}) +else() + set(CEMU_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR}) +endif() +if(CEMU_ARCHITECTURE MATCHES "(aarch64)|(AARCH64)|(arm64)|(ARM64)") add_subdirectory("dependencies/xbyak_aarch64" EXCLUDE_FROM_ALL) endif() @@ -231,4 +236,4 @@ if (NOT ZArchive_FOUND) add_subdirectory("dependencies/ZArchive" EXCLUDE_FROM_ALL) endif() -add_subdirectory(src) +add_subdirectory(src) \ No newline at end of file diff --git a/dependencies/ih264d/CMakeLists.txt b/dependencies/ih264d/CMakeLists.txt index 686a9d08ae..64ac0931f8 100644 --- a/dependencies/ih264d/CMakeLists.txt +++ b/dependencies/ih264d/CMakeLists.txt @@ -183,6 +183,9 @@ target_sources(ih264d PRIVATE "decoder/arm/ih264d_function_selector.c" ) target_compile_options(ih264d PRIVATE -DARMV8) +if(APPLE) + target_sources(ih264d PRIVATE "common/armv8/macos_arm_symbol_aliases.s") +endif() else() message(FATAL_ERROR "ih264d unknown architecture: ${IH264D_ARCHITECTURE}") endif() diff --git a/dependencies/ih264d/common/armv8/ih264_intra_pred_chroma_av8.s b/dependencies/ih264d/common/armv8/ih264_intra_pred_chroma_av8.s index 39c02560fc..c0d9cf99ee 100644 --- a/dependencies/ih264d/common/armv8/ih264_intra_pred_chroma_av8.s +++ b/dependencies/ih264d/common/armv8/ih264_intra_pred_chroma_av8.s @@ -429,8 +429,13 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8: rev64 v7.4h, v2.4h ld1 {v3.2s}, [x10] sub x5, x3, #8 +#ifdef __APPLE__ + adrp x12, _ih264_gai1_intrapred_chroma_plane_coeffs1@GOTPAGE + ldr x12, [x12, _ih264_gai1_intrapred_chroma_plane_coeffs1@GOTPAGEOFF] +#else adrp x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1 ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1] +#endif usubl v10.8h, v5.8b, v1.8b ld1 {v8.8b, v9.8b}, [x12] // Load multiplication factors 1 to 8 into D3 mov v8.d[1], v9.d[0] @@ -484,10 +489,13 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8: zip1 v1.8h, v0.8h, v2.8h zip2 v2.8h, v0.8h, v2.8h mov v0.16b, v1.16b - +#ifdef __APPLE__ + adrp x12, _ih264_gai1_intrapred_chroma_plane_coeffs2@GOTPAGE + ldr x12, [x12, _ih264_gai1_intrapred_chroma_plane_coeffs2@GOTPAGEOFF] +#else adrp x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2 ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2] - +#endif ld1 {v8.2s, v9.2s}, [x12] mov v8.d[1], v9.d[0] mov v10.16b, v8.16b diff --git a/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_16x16_av8.s index fa19c12132..2422d8cd16 100644 --- a/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_16x16_av8.s +++ b/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_16x16_av8.s @@ -431,10 +431,13 @@ ih264_intra_pred_luma_16x16_mode_plane_av8: mov x10, x1 //top_left mov x4, #-1 ld1 {v2.2s}, [x1], x8 - +#ifdef __APPLE__ + adrp x7, _ih264_gai1_intrapred_luma_plane_coeffs@GOTPAGE + ldr x7, [x7, _ih264_gai1_intrapred_luma_plane_coeffs@GOTPAGEOFF] +#else adrp x7, :got:ih264_gai1_intrapred_luma_plane_coeffs ldr x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs] - +#endif ld1 {v0.2s}, [x1] rev64 v2.8b, v2.8b ld1 {v6.2s, v7.2s}, [x7] diff --git a/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_8x8_av8.s index 273aa81b82..6fa31dedb3 100644 --- a/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_8x8_av8.s +++ b/dependencies/ih264d/common/armv8/ih264_intra_pred_luma_8x8_av8.s @@ -1029,9 +1029,13 @@ ih264_intra_pred_luma_8x8_mode_horz_u_av8: mov v3.d[0], v2.d[1] ext v4.16b, v2.16b , v2.16b , #1 mov v5.d[0], v4.d[1] - +#ifdef __APPLE__ + adrp x12, _ih264_gai1_intrapred_luma_8x8_horz_u@GOTPAGE + ldr x12, [x12, _ih264_gai1_intrapred_luma_8x8_horz_u@GOTPAGEOFF] +#else adrp x12, :got:ih264_gai1_intrapred_luma_8x8_horz_u ldr x12, [x12, #:got_lo12:ih264_gai1_intrapred_luma_8x8_horz_u] +#endif uaddl v20.8h, v0.8b, v2.8b uaddl v22.8h, v1.8b, v3.8b uaddl v24.8h, v2.8b, v4.8b diff --git a/dependencies/ih264d/common/armv8/ih264_weighted_bi_pred_av8.s b/dependencies/ih264d/common/armv8/ih264_weighted_bi_pred_av8.s index 475f690ec5..8d6aa995e2 100644 --- a/dependencies/ih264d/common/armv8/ih264_weighted_bi_pred_av8.s +++ b/dependencies/ih264d/common/armv8/ih264_weighted_bi_pred_av8.s @@ -142,14 +142,22 @@ ih264_weighted_bi_pred_luma_av8: sxtw x4, w4 sxtw x5, w5 stp x19, x20, [sp, #-16]! +#ifndef __APPLE__ ldr w8, [sp, #80] //Load wt2 in w8 ldr w9, [sp, #88] //Load ofst1 in w9 - add w6, w6, #1 //w6 = log_WD + 1 - neg w10, w6 //w10 = -(log_WD + 1) - dup v0.8h, w10 //Q0 = -(log_WD + 1) (32-bit) ldr w10, [sp, #96] //Load ofst2 in w10 ldr w11, [sp, #104] //Load ht in w11 ldr w12, [sp, #112] //Load wd in w12 +#else + ldr w8, [sp, #80] //Load wt2 in w8 + ldr w9, [sp, #84] //Load ofst1 in w9 + ldr w10, [sp, #88] //Load ofst2 in w10 + ldr w11, [sp, #92] //Load ht in w11 + ldr w12, [sp, #96] //Load wd in w12 +#endif + add w6, w6, #1 //w6 = log_WD + 1 + neg w10, w6 //w10 = -(log_WD + 1) + dup v0.8h, w10 //Q0 = -(log_WD + 1) (32-bit) add w9, w9, #1 //w9 = ofst1 + 1 add w9, w9, w10 //w9 = ofst1 + ofst2 + 1 mov v2.s[0], w7 @@ -424,17 +432,24 @@ ih264_weighted_bi_pred_chroma_av8: sxtw x5, w5 stp x19, x20, [sp, #-16]! - +#ifndef __APPLE__ ldr w8, [sp, #80] //Load wt2 in w8 + ldr w9, [sp, #88] //Load ofst1 in w9 + ldr w10, [sp, #96] //Load ofst2 in w10 + ldr w11, [sp, #104] //Load ht in w11 + ldr w12, [sp, #112] //Load wd in w12 +#else + ldr w8, [sp, #80] //Load wt2 in w8 + ldr w9, [sp, #84] //Load ofst1 in w9 + ldr w10, [sp, #88] //Load ofst2 in w10 + ldr w11, [sp, #92] //Load ht in w11 + ldr w12, [sp, #96] //Load wd in w12 +#endif dup v4.4s, w8 //Q2 = (wt2_u, wt2_v) (32-bit) dup v2.4s, w7 //Q1 = (wt1_u, wt1_v) (32-bit) add w6, w6, #1 //w6 = log_WD + 1 - ldr w9, [sp, #88] //Load ofst1 in w9 - ldr w10, [sp, #96] //Load ofst2 in w10 neg w20, w6 //w20 = -(log_WD + 1) dup v0.8h, w20 //Q0 = -(log_WD + 1) (16-bit) - ldr w11, [sp, #104] //Load ht in x11 - ldr w12, [sp, #112] //Load wd in x12 dup v20.8h, w9 //0ffset1 dup v21.8h, w10 //0ffset2 srhadd v6.8b, v20.8b, v21.8b diff --git a/dependencies/ih264d/common/armv8/macos_arm_symbol_aliases.s b/dependencies/ih264d/common/armv8/macos_arm_symbol_aliases.s new file mode 100644 index 0000000000..3639f1b3fc --- /dev/null +++ b/dependencies/ih264d/common/armv8/macos_arm_symbol_aliases.s @@ -0,0 +1,185 @@ +// macOS clang compilers append preceding underscores to function names, this is to prevent +// mismatches with the assembly function names and the C functions as defined in the header. + +.global _ih264_deblk_chroma_horz_bs4_av8 +_ih264_deblk_chroma_horz_bs4_av8 = ih264_deblk_chroma_horz_bs4_av8 + +.global _ih264_deblk_chroma_horz_bslt4_av8 +_ih264_deblk_chroma_horz_bslt4_av8 = ih264_deblk_chroma_horz_bslt4_av8 + +.global _ih264_deblk_chroma_vert_bs4_av8 +_ih264_deblk_chroma_vert_bs4_av8 = ih264_deblk_chroma_vert_bs4_av8 + +.global _ih264_deblk_chroma_vert_bslt4_av8 +_ih264_deblk_chroma_vert_bslt4_av8 = ih264_deblk_chroma_vert_bslt4_av8 + +.global _ih264_deblk_luma_horz_bs4_av8 +_ih264_deblk_luma_horz_bs4_av8 = ih264_deblk_luma_horz_bs4_av8 + +.global _ih264_deblk_luma_horz_bslt4_av8 +_ih264_deblk_luma_horz_bslt4_av8 = ih264_deblk_luma_horz_bslt4_av8 + +.global _ih264_deblk_luma_vert_bs4_av8 +_ih264_deblk_luma_vert_bs4_av8 = ih264_deblk_luma_vert_bs4_av8 + +.global _ih264_deblk_luma_vert_bslt4_av8 +_ih264_deblk_luma_vert_bslt4_av8 = ih264_deblk_luma_vert_bslt4_av8 + +.global _ih264_default_weighted_pred_chroma_av8 +_ih264_default_weighted_pred_chroma_av8 = ih264_default_weighted_pred_chroma_av8 + +.global _ih264_default_weighted_pred_luma_av8 +_ih264_default_weighted_pred_luma_av8 = ih264_default_weighted_pred_luma_av8 + +.global _ih264_ihadamard_scaling_4x4_av8 +_ih264_ihadamard_scaling_4x4_av8 = ih264_ihadamard_scaling_4x4_av8 + +.global _ih264_inter_pred_chroma_av8 +_ih264_inter_pred_chroma_av8 = ih264_inter_pred_chroma_av8 + +.global _ih264_inter_pred_luma_copy_av8 +_ih264_inter_pred_luma_copy_av8 = ih264_inter_pred_luma_copy_av8 + +.global _ih264_inter_pred_luma_horz_av8 +_ih264_inter_pred_luma_horz_av8 = ih264_inter_pred_luma_horz_av8 + +.global _ih264_inter_pred_luma_horz_hpel_vert_hpel_av8 +_ih264_inter_pred_luma_horz_hpel_vert_hpel_av8 = ih264_inter_pred_luma_horz_hpel_vert_hpel_av8 + +.global _ih264_inter_pred_luma_horz_hpel_vert_qpel_av8 +_ih264_inter_pred_luma_horz_hpel_vert_qpel_av8 = ih264_inter_pred_luma_horz_hpel_vert_qpel_av8 + +.global _ih264_inter_pred_luma_horz_qpel_av8 +_ih264_inter_pred_luma_horz_qpel_av8 = ih264_inter_pred_luma_horz_qpel_av8 + +.global _ih264_inter_pred_luma_horz_qpel_vert_hpel_av8 +_ih264_inter_pred_luma_horz_qpel_vert_hpel_av8 = ih264_inter_pred_luma_horz_qpel_vert_hpel_av8 + +.global _ih264_inter_pred_luma_horz_qpel_vert_qpel_av8 +_ih264_inter_pred_luma_horz_qpel_vert_qpel_av8 = ih264_inter_pred_luma_horz_qpel_vert_qpel_av8 + +.global _ih264_inter_pred_luma_vert_av8 +_ih264_inter_pred_luma_vert_av8 = ih264_inter_pred_luma_vert_av8 + +.global _ih264_inter_pred_luma_vert_qpel_av8 +_ih264_inter_pred_luma_vert_qpel_av8 = ih264_inter_pred_luma_vert_qpel_av8 + +.global _ih264_intra_pred_chroma_8x8_mode_horz_av8 +_ih264_intra_pred_chroma_8x8_mode_horz_av8 = ih264_intra_pred_chroma_8x8_mode_horz_av8 + +.global _ih264_intra_pred_chroma_8x8_mode_plane_av8 +_ih264_intra_pred_chroma_8x8_mode_plane_av8 = ih264_intra_pred_chroma_8x8_mode_plane_av8 + +.global _ih264_intra_pred_chroma_8x8_mode_vert_av8 +_ih264_intra_pred_chroma_8x8_mode_vert_av8 = ih264_intra_pred_chroma_8x8_mode_vert_av8 + +.global _ih264_intra_pred_luma_16x16_mode_dc_av8 +_ih264_intra_pred_luma_16x16_mode_dc_av8 = ih264_intra_pred_luma_16x16_mode_dc_av8 + +.global _ih264_intra_pred_luma_16x16_mode_horz_av8 +_ih264_intra_pred_luma_16x16_mode_horz_av8 = ih264_intra_pred_luma_16x16_mode_horz_av8 + +.global _ih264_intra_pred_luma_16x16_mode_plane_av8 +_ih264_intra_pred_luma_16x16_mode_plane_av8 = ih264_intra_pred_luma_16x16_mode_plane_av8 + +.global _ih264_intra_pred_luma_16x16_mode_vert_av8 +_ih264_intra_pred_luma_16x16_mode_vert_av8 = ih264_intra_pred_luma_16x16_mode_vert_av8 + +.global _ih264_intra_pred_luma_4x4_mode_dc_av8 +_ih264_intra_pred_luma_4x4_mode_dc_av8 = ih264_intra_pred_luma_4x4_mode_dc_av8 + +.global _ih264_intra_pred_luma_4x4_mode_diag_dl_av8 +_ih264_intra_pred_luma_4x4_mode_diag_dl_av8 = ih264_intra_pred_luma_4x4_mode_diag_dl_av8 + +.global _ih264_intra_pred_luma_4x4_mode_diag_dr_av8 +_ih264_intra_pred_luma_4x4_mode_diag_dr_av8 = ih264_intra_pred_luma_4x4_mode_diag_dr_av8 + +.global _ih264_intra_pred_luma_4x4_mode_horz_av8 +_ih264_intra_pred_luma_4x4_mode_horz_av8 = ih264_intra_pred_luma_4x4_mode_horz_av8 + +.global _ih264_intra_pred_luma_4x4_mode_horz_d_av8 +_ih264_intra_pred_luma_4x4_mode_horz_d_av8 = ih264_intra_pred_luma_4x4_mode_horz_d_av8 + +.global _ih264_intra_pred_luma_4x4_mode_horz_u_av8 +_ih264_intra_pred_luma_4x4_mode_horz_u_av8 = ih264_intra_pred_luma_4x4_mode_horz_u_av8 + +.global _ih264_intra_pred_luma_4x4_mode_vert_av8 +_ih264_intra_pred_luma_4x4_mode_vert_av8 = ih264_intra_pred_luma_4x4_mode_vert_av8 + +.global _ih264_intra_pred_luma_4x4_mode_vert_l_av8 +_ih264_intra_pred_luma_4x4_mode_vert_l_av8 = ih264_intra_pred_luma_4x4_mode_vert_l_av8 + +.global _ih264_intra_pred_luma_4x4_mode_vert_r_av8 +_ih264_intra_pred_luma_4x4_mode_vert_r_av8 = ih264_intra_pred_luma_4x4_mode_vert_r_av8 + +.global _ih264_intra_pred_luma_8x8_mode_dc_av8 +_ih264_intra_pred_luma_8x8_mode_dc_av8 = ih264_intra_pred_luma_8x8_mode_dc_av8 + +.global _ih264_intra_pred_luma_8x8_mode_diag_dl_av8 +_ih264_intra_pred_luma_8x8_mode_diag_dl_av8 = ih264_intra_pred_luma_8x8_mode_diag_dl_av8 + +.global _ih264_intra_pred_luma_8x8_mode_diag_dr_av8 +_ih264_intra_pred_luma_8x8_mode_diag_dr_av8 = ih264_intra_pred_luma_8x8_mode_diag_dr_av8 + +.global _ih264_intra_pred_luma_8x8_mode_horz_av8 +_ih264_intra_pred_luma_8x8_mode_horz_av8 = ih264_intra_pred_luma_8x8_mode_horz_av8 + +.global _ih264_intra_pred_luma_8x8_mode_horz_d_av8 +_ih264_intra_pred_luma_8x8_mode_horz_d_av8 = ih264_intra_pred_luma_8x8_mode_horz_d_av8 + +.global _ih264_intra_pred_luma_8x8_mode_horz_u_av8 +_ih264_intra_pred_luma_8x8_mode_horz_u_av8 = ih264_intra_pred_luma_8x8_mode_horz_u_av8 + +.global _ih264_intra_pred_luma_8x8_mode_vert_av8 +_ih264_intra_pred_luma_8x8_mode_vert_av8 = ih264_intra_pred_luma_8x8_mode_vert_av8 + +.global _ih264_intra_pred_luma_8x8_mode_vert_l_av8 +_ih264_intra_pred_luma_8x8_mode_vert_l_av8 = ih264_intra_pred_luma_8x8_mode_vert_l_av8 + +.global _ih264_intra_pred_luma_8x8_mode_vert_r_av8 +_ih264_intra_pred_luma_8x8_mode_vert_r_av8 = ih264_intra_pred_luma_8x8_mode_vert_r_av8 + +.global _ih264_iquant_itrans_recon_4x4_av8 +_ih264_iquant_itrans_recon_4x4_av8 = ih264_iquant_itrans_recon_4x4_av8 + +.global _ih264_iquant_itrans_recon_4x4_dc_av8 +_ih264_iquant_itrans_recon_4x4_dc_av8 = ih264_iquant_itrans_recon_4x4_dc_av8 + +.global _ih264_iquant_itrans_recon_8x8_av8 +_ih264_iquant_itrans_recon_8x8_av8 = ih264_iquant_itrans_recon_8x8_av8 + +.global _ih264_iquant_itrans_recon_8x8_dc_av8 +_ih264_iquant_itrans_recon_8x8_dc_av8 = ih264_iquant_itrans_recon_8x8_dc_av8 + +.global _ih264_iquant_itrans_recon_chroma_4x4_av8 +_ih264_iquant_itrans_recon_chroma_4x4_av8 = ih264_iquant_itrans_recon_chroma_4x4_av8 + +.global _ih264_iquant_itrans_recon_chroma_4x4_dc_av8 +_ih264_iquant_itrans_recon_chroma_4x4_dc_av8 = ih264_iquant_itrans_recon_chroma_4x4_dc_av8 + +.global _ih264_pad_left_chroma_av8 +_ih264_pad_left_chroma_av8 = ih264_pad_left_chroma_av8 + +.global _ih264_pad_left_luma_av8 +_ih264_pad_left_luma_av8 = ih264_pad_left_luma_av8 + +.global _ih264_pad_right_chroma_av8 +_ih264_pad_right_chroma_av8 = ih264_pad_right_chroma_av8 + +.global _ih264_pad_right_luma_av8 +_ih264_pad_right_luma_av8 = ih264_pad_right_luma_av8 + +.global _ih264_pad_top_av8 +_ih264_pad_top_av8 = ih264_pad_top_av8 + +.global _ih264_weighted_bi_pred_chroma_av8 +_ih264_weighted_bi_pred_chroma_av8 = ih264_weighted_bi_pred_chroma_av8 + +.global _ih264_weighted_bi_pred_luma_av8 +_ih264_weighted_bi_pred_luma_av8 = ih264_weighted_bi_pred_luma_av8 + +.global _ih264_weighted_pred_chroma_av8 +_ih264_weighted_pred_chroma_av8 = ih264_weighted_pred_chroma_av8 + +.global _ih264_weighted_pred_luma_av8 +_ih264_weighted_pred_luma_av8 = ih264_weighted_pred_luma_av8 \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ee7f8610df..04b6dfdd04 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -101,13 +101,21 @@ if (MACOS_BUNDLE) endforeach(folder) if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(LIBUSB_PATH "${CMAKE_BINARY_DIR}/vcpkg_installed/x64-osx/debug/lib/libusb-1.0.0.dylib") + set(LIBUSB_PATH "${CMAKE_BINARY_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/debug/lib/libusb-1.0.0.dylib") else() - set(LIBUSB_PATH "${CMAKE_BINARY_DIR}/vcpkg_installed/x64-osx/lib/libusb-1.0.0.dylib") + set(LIBUSB_PATH "${CMAKE_BINARY_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/lib/libusb-1.0.0.dylib") endif() + if (EXISTS "/usr/local/lib/libMoltenVK.dylib") + set(MOLTENVK_PATH "/usr/local/lib/libMoltenVK.dylib") + elseif (EXISTS "/opt/homebrew/lib/libMoltenVK.dylib") + set(MOLTENVK_PATH "/opt/homebrew/lib/libMoltenVK.dylib") + else() + message(FATAL_ERROR "failed to find libMoltenVK.dylib") + endif () + add_custom_command (TARGET CemuBin POST_BUILD - COMMAND ${CMAKE_COMMAND} ARGS -E copy "/usr/local/lib/libMoltenVK.dylib" "${CMAKE_SOURCE_DIR}/bin/${OUTPUT_NAME}.app/Contents/Frameworks/libMoltenVK.dylib" + COMMAND ${CMAKE_COMMAND} ARGS -E copy "${MOLTENVK_PATH}" "${CMAKE_SOURCE_DIR}/bin/${OUTPUT_NAME}.app/Contents/Frameworks/libMoltenVK.dylib" COMMAND ${CMAKE_COMMAND} ARGS -E copy "${LIBUSB_PATH}" "${CMAKE_SOURCE_DIR}/bin/${OUTPUT_NAME}.app/Contents/Frameworks/libusb-1.0.0.dylib" COMMAND ${CMAKE_COMMAND} ARGS -E copy "${CMAKE_SOURCE_DIR}/src/resource/update.sh" "${CMAKE_SOURCE_DIR}/bin/${OUTPUT_NAME}.app/Contents/MacOS/update.sh" COMMAND bash -c "install_name_tool -add_rpath @executable_path/../Frameworks ${CMAKE_SOURCE_DIR}/bin/${OUTPUT_NAME}.app/Contents/MacOS/${OUTPUT_NAME}" diff --git a/src/Cafe/CMakeLists.txt b/src/Cafe/CMakeLists.txt index 71866b2155..64baa33799 100644 --- a/src/Cafe/CMakeLists.txt +++ b/src/Cafe/CMakeLists.txt @@ -537,7 +537,7 @@ if(APPLE) target_sources(CemuCafe PRIVATE "HW/Latte/Renderer/Vulkan/CocoaSurface.mm") endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(AARCH64)") +if(CEMU_ARCHITECTURE MATCHES "(aarch64)|(AARCH64)|(arm64)|(ARM64)") target_sources(CemuCafe PRIVATE HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.cpp HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.h diff --git a/src/Cafe/HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.cpp b/src/Cafe/HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.cpp index cb71234d83..728460a4a8 100644 --- a/src/Cafe/HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.cpp @@ -169,8 +169,10 @@ struct AArch64GenContext_t : CodeGenerator bool processAllJumps() { - for (auto&& [jumpStart, jumpInfo] : jumps) + for (auto jump : jumps) { + auto jumpStart = jump.first; + auto jumpInfo = jump.second; bool success = std::visit( [&, this](const auto& jump) { setSize(jumpStart); diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp index aec51725f4..2bbb617d7c 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp +++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp @@ -6,6 +6,8 @@ #if defined(ARCH_X86_64) && defined(__GNUC__) #include +#elif defined(__aarch64__) +#include #endif struct @@ -502,6 +504,114 @@ void LatteIndices_fastConvertU32_AVX2(const void* indexDataInput, void* indexDat indexMax = std::max(indexMax, _maxIndex); indexMin = std::min(indexMin, _minIndex); } +#elif defined(__aarch64__) + +void LatteIndices_fastConvertU16_NEON(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) +{ + const uint16* indicesU16BE = (const uint16*)indexDataInput; + uint16* indexOutput = (uint16*)indexDataOutput; + sint32 count8 = count >> 3; + sint32 countRemaining = count & 7; + + if (count8) + { + uint16x8_t mMin = vdupq_n_u16(0xFFFF); + uint16x8_t mMax = vdupq_n_u16(0x0000); + uint16x8_t mTemp; + uint16x8_t* mRawIndices = (uint16x8_t*) indicesU16BE; + indicesU16BE += count8 * 8; + uint16x8_t* mOutputIndices = (uint16x8_t*) indexOutput; + indexOutput += count8 * 8; + + while (count8--) + { + mTemp = vld1q_u16((uint16*)mRawIndices); + mRawIndices++; + mTemp = vrev16q_u8(mTemp); + mMin = vminq_u16(mMin, mTemp); + mMax = vmaxq_u16(mMax, mTemp); + vst1q_u16((uint16*)mOutputIndices, mTemp); + mOutputIndices++; + } + + uint16* mMaxU16 = (uint16*)&mMax; + uint16* mMinU16 = (uint16*)&mMin; + + for (int i = 0; i < 8; ++i) { + indexMax = std::max(indexMax, (uint32)mMaxU16[i]); + indexMin = std::min(indexMin, (uint32)mMinU16[i]); + } + } + // process remaining indices + uint32 _minIndex = 0xFFFFFFFF; + uint32 _maxIndex = 0; + for (sint32 i = countRemaining; (--i) >= 0;) + { + uint16 idx = _swapEndianU16(*indicesU16BE); + *indexOutput = idx; + indexOutput++; + indicesU16BE++; + _maxIndex = std::max(_maxIndex, (uint32)idx); + _minIndex = std::min(_minIndex, (uint32)idx); + } + // update min/max + indexMax = std::max(indexMax, _maxIndex); + indexMin = std::min(indexMin, _minIndex); +} + +void LatteIndices_fastConvertU32_NEON(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax) +{ + const uint32* indicesU32BE = (const uint32*)indexDataInput; + uint32* indexOutput = (uint32*)indexDataOutput; + sint32 count8 = count >> 2; + sint32 countRemaining = count & 3; + + if (count8) + { + uint32x4_t mMin = vdupq_n_u32(0xFFFFFFFF); + uint32x4_t mMax = vdupq_n_u32(0x00000000); + uint32x4_t mTemp; + uint32x4_t* mRawIndices = (uint32x4_t*) indicesU32BE; + indicesU32BE += count8 * 4; + uint32x4_t* mOutputIndices = (uint32x4_t*) indexOutput; + indexOutput += count8 * 4; + + while (count8--) + { + mTemp = vld1q_u32((uint32*)mRawIndices); + mRawIndices++; + mTemp = vrev32q_u8(mTemp); + mMin = vminq_u32(mMin, mTemp); + mMax = vmaxq_u32(mMax, mTemp); + vst1q_u32((uint32*)mOutputIndices, mTemp); + mOutputIndices++; + } + + uint32* mMaxU32 = (uint32*)&mMax; + uint32* mMinU32 = (uint32*)&mMin; + + for (int i = 0; i < 4; ++i) { + indexMax = std::max(indexMax, mMaxU32[i]); + indexMin = std::min(indexMin, mMinU32[i]); + } + } + // process remaining indices + uint32 _minIndex = 0xFFFFFFFF; + uint32 _maxIndex = 0; + for (sint32 i = countRemaining; (--i) >= 0;) + { + uint32 idx = _swapEndianU32(*indicesU32BE); + *indexOutput = idx; + indexOutput++; + indicesU32BE++; + _maxIndex = std::max(_maxIndex, idx); + _minIndex = std::min(_minIndex, idx); + } + // update min/max + indexMax = std::max(indexMax, _maxIndex); + indexMin = std::min(indexMin, _minIndex); +} + #endif template @@ -688,27 +798,31 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 { if (indexType == LatteIndexType::U16_BE) { - #if defined(ARCH_X86_64) +#if defined(ARCH_X86_64) if (g_CPUFeatures.x86.avx2) LatteIndices_fastConvertU16_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax); else if (g_CPUFeatures.x86.sse4_1 && g_CPUFeatures.x86.ssse3) LatteIndices_fastConvertU16_SSE41(indexData, indexOutputPtr, count, indexMin, indexMax); else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); - #else +#elif defined(__aarch64__) + LatteIndices_fastConvertU16_NEON(indexData, indexOutputPtr, count, indexMin, indexMax); +#else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); - #endif +#endif } else if (indexType == LatteIndexType::U32_BE) { - #if defined(ARCH_X86_64) +#if defined(ARCH_X86_64) if (g_CPUFeatures.x86.avx2) LatteIndices_fastConvertU32_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax); else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); - #else +#elif defined(__aarch64__) + LatteIndices_fastConvertU32_NEON(indexData, indexOutputPtr, count, indexMin, indexMax); +#else LatteIndices_convertBE(indexData, indexOutputPtr, count, indexMin, indexMax); - #endif +#endif } else if (indexType == LatteIndexType::U16_LE) { diff --git a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp index 870d1850a7..2eef929d67 100644 --- a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp +++ b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp @@ -25,7 +25,11 @@ void nnNfp_update(); namespace coreinit { +#ifdef __arm64__ + void __OSFiberThreadEntry(uint32, uint32); +#else void __OSFiberThreadEntry(void* thread); +#endif void __OSAddReadyThreadToRunQueue(OSThread_t* thread); void __OSRemoveThreadFromRunQueues(OSThread_t* thread); }; @@ -49,7 +53,7 @@ namespace coreinit struct OSHostThread { - OSHostThread(OSThread_t* thread) : m_thread(thread), m_fiber(__OSFiberThreadEntry, this, this) + OSHostThread(OSThread_t* thread) : m_thread(thread), m_fiber((void(*)(void*))__OSFiberThreadEntry, this, this) { } @@ -1304,8 +1308,14 @@ namespace coreinit __OSThreadStartTimeslice(hostThread->m_thread, &hostThread->ppcInstance); } +#ifdef __arm64__ + void __OSFiberThreadEntry(uint32 _high, uint32 _low) + { + uint64 _thread = (uint64) _high << 32 | _low; +#else void __OSFiberThreadEntry(void* _thread) { +#endif OSHostThread* hostThread = (OSHostThread*)_thread; #if defined(ARCH_X86_64) diff --git a/src/Common/precompiled.h b/src/Common/precompiled.h index 9e5c60f5ee..996a13b3dc 100644 --- a/src/Common/precompiled.h +++ b/src/Common/precompiled.h @@ -310,7 +310,8 @@ inline uint64 __rdtsc() inline void _mm_mfence() { - + asm volatile("" ::: "memory"); + std::atomic_thread_fence(std::memory_order_seq_cst); } inline unsigned char _addcarry_u64(unsigned char carry, unsigned long long a, unsigned long long b, unsigned long long *result) diff --git a/src/gui/MainWindow.cpp b/src/gui/MainWindow.cpp index 2f63b46089..1d11702e7e 100644 --- a/src/gui/MainWindow.cpp +++ b/src/gui/MainWindow.cpp @@ -139,6 +139,7 @@ enum MAINFRAME_MENU_ID_DEBUG_VK_ACCURATE_BARRIERS, // debug->logging + MAINFRAME_MENU_ID_DEBUG_LOGGING_MESSAGE = 21499, MAINFRAME_MENU_ID_DEBUG_LOGGING0 = 21500, MAINFRAME_MENU_ID_DEBUG_ADVANCED_PPC_INFO = 21599, // debug->dump @@ -2213,7 +2214,7 @@ void MainWindow::RecreateMenu() debugLoggingMenu->AppendSeparator(); wxMenu* logCosModulesMenu = new wxMenu(); - logCosModulesMenu->AppendCheckItem(0, _("&Options below are for experts. Leave off if unsure"), wxEmptyString)->Enable(false); + logCosModulesMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING_MESSAGE, _("&Options below are for experts. Leave off if unsure"), wxEmptyString)->Enable(false); logCosModulesMenu->AppendSeparator(); logCosModulesMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::CoreinitFile), _("coreinit File-Access API"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::CoreinitFile)); logCosModulesMenu->AppendCheckItem(MAINFRAME_MENU_ID_DEBUG_LOGGING0 + stdx::to_underlying(LogType::CoreinitThreadSync), _("coreinit Thread-Synchronization API"), wxEmptyString)->Check(cemuLog_isLoggingEnabled(LogType::CoreinitThreadSync)); diff --git a/src/util/Fiber/FiberUnix.cpp b/src/util/Fiber/FiberUnix.cpp index 0d52706938..36430449e2 100644 --- a/src/util/Fiber/FiberUnix.cpp +++ b/src/util/Fiber/FiberUnix.cpp @@ -15,7 +15,12 @@ Fiber::Fiber(void(*FiberEntryPoint)(void* userParam), void* userParam, void* pri ctx->uc_stack.ss_sp = m_stackPtr; ctx->uc_stack.ss_size = stackSize; ctx->uc_link = &ctx[0]; +#ifdef __arm64__ + // https://www.man7.org/linux/man-pages/man3/makecontext.3.html#NOTES + makecontext(ctx, (void(*)())FiberEntryPoint, 2, (uint64) userParam >> 32, userParam); +#else makecontext(ctx, (void(*)())FiberEntryPoint, 1, userParam); +#endif this->m_implData = (void*)ctx; } diff --git a/src/util/MemMapper/MemMapperUnix.cpp b/src/util/MemMapper/MemMapperUnix.cpp index 0ade291d43..8e800e53cb 100644 --- a/src/util/MemMapper/MemMapperUnix.cpp +++ b/src/util/MemMapper/MemMapperUnix.cpp @@ -45,7 +45,11 @@ namespace MemMapper void* r; if(fromReservation) { - if( mprotect(baseAddr, size, GetProt(permissionFlags)) == 0 ) + uint64 page_size = sysconf(_SC_PAGESIZE); + void* page = baseAddr; + if ( (uint64) baseAddr % page_size != 0 ) + page = (void*) ((uint64)baseAddr & ~(page_size - 1)); + if( mprotect(page, size, GetProt(permissionFlags)) == 0 ) r = baseAddr; else r = nullptr; diff --git a/src/util/highresolutiontimer/HighResolutionTimer.cpp b/src/util/highresolutiontimer/HighResolutionTimer.cpp index 67ffa3492f..bb4a40ab3c 100644 --- a/src/util/highresolutiontimer/HighResolutionTimer.cpp +++ b/src/util/highresolutiontimer/HighResolutionTimer.cpp @@ -27,6 +27,8 @@ uint64 HighResolutionTimer::m_freq = []() -> uint64 { LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); return (uint64)(freq.QuadPart); +#elif BOOST_OS_MACOS + return 1000000000; #else timespec pc; clock_getres(CLOCK_MONOTONIC_RAW, &pc);