-
Notifications
You must be signed in to change notification settings - Fork 8
Replace a single loop, and a few changes are needed for the small kernel PR #473
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
ab0e899
3bd41e1
a14c823
998eb20
7013325
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -84,70 +84,23 @@ void adjrxt(Real rate[rxntot], const Real inv[nfs], const Real m) { | |
| // TODO: the lines of concern *kind of* bear resemblance to the similarly | ||
| // concerning lines in linmat(), though it's difficult to tell if that results | ||
| // in consistent units | ||
| KOKKOS_INLINE_FUNCTION | ||
| void imp_prod_loss(Real prod[clscnt4], Real loss[clscnt4], Real y[gas_pcnst], | ||
| const Real rxt[rxntot], const Real het_rates[gas_pcnst]) { | ||
| template <typename VectorType> | ||
| KOKKOS_INLINE_FUNCTION void | ||
| imp_prod_loss(Real prod[clscnt4], Real loss[clscnt4], VectorType &y, | ||
| const Real rxt[rxntot], const Real het_rates[gas_pcnst]) { | ||
| const Real zero = 0; | ||
| loss[0] = (+het_rates[1] + rxt[0] + rxt[2]) * (+y[1]); | ||
| loss[0] = (het_rates[1] + rxt[0] + rxt[2]) * y[1]; | ||
| prod[0] = zero; | ||
| loss[1] = (+het_rates[2]) * (+y[2]); | ||
| prod[1] = (+rxt[3]) * (+y[3]); | ||
| loss[2] = (+het_rates[3] + rxt[3]) * (+y[3]); | ||
| prod[2] = (+rxt[4] + 0.500000 * rxt[5] + rxt[6]) * (+y[4]); | ||
| loss[3] = (+het_rates[4] + rxt[4] + rxt[5] + rxt[6]) * (+y[4]); | ||
| loss[1] = het_rates[2] * y[2]; | ||
| prod[1] = rxt[3] * y[3]; | ||
| loss[2] = (het_rates[3] + rxt[3]) * y[3]; | ||
| prod[2] = (rxt[4] + 0.500000 * rxt[5] + rxt[6]) * y[4]; | ||
| loss[3] = (het_rates[4] + rxt[4] + rxt[5] + rxt[6]) * y[4]; | ||
| prod[3] = zero; | ||
| loss[4] = (+het_rates[5]) * (+y[5]); | ||
| prod[4] = zero; | ||
| loss[5] = (+het_rates[6]) * (+y[6]); | ||
| prod[5] = zero; | ||
| loss[6] = (+het_rates[7]) * (+y[7]); | ||
| prod[6] = zero; | ||
| loss[7] = (+het_rates[8]) * (+y[8]); | ||
| prod[7] = zero; | ||
| loss[8] = (+het_rates[9]) * (+y[9]); | ||
| prod[8] = zero; | ||
| loss[9] = (+het_rates[10]) * (+y[10]); | ||
| prod[9] = zero; | ||
| loss[10] = (+het_rates[11]) * (+y[11]); | ||
| prod[10] = zero; | ||
| loss[11] = (+het_rates[12]) * (+y[12]); | ||
| prod[11] = zero; | ||
| loss[12] = (+het_rates[13]) * (+y[13]); | ||
| prod[12] = zero; | ||
| loss[13] = (+het_rates[14]) * (+y[14]); | ||
| prod[13] = zero; | ||
| loss[14] = (+het_rates[15]) * (+y[15]); | ||
| prod[14] = zero; | ||
| loss[15] = (+het_rates[16]) * (+y[16]); | ||
| prod[15] = zero; | ||
| loss[16] = (+het_rates[17]) * (+y[17]); | ||
| prod[16] = zero; | ||
| loss[17] = (+het_rates[18]) * (+y[18]); | ||
| prod[17] = zero; | ||
| loss[18] = (+het_rates[19]) * (+y[19]); | ||
| prod[18] = zero; | ||
| loss[19] = (+het_rates[20]) * (+y[20]); | ||
| prod[19] = zero; | ||
| loss[20] = (+het_rates[21]) * (+y[21]); | ||
| prod[20] = zero; | ||
| loss[21] = (+het_rates[22]) * (+y[22]); | ||
| prod[21] = zero; | ||
| loss[22] = (+het_rates[23]) * (+y[23]); | ||
| prod[22] = zero; | ||
| loss[23] = (+het_rates[24]) * (+y[24]); | ||
| prod[23] = zero; | ||
| loss[24] = (+het_rates[25]) * (+y[25]); | ||
| prod[24] = zero; | ||
| loss[25] = (+het_rates[26]) * (+y[26]); | ||
| prod[25] = zero; | ||
| loss[26] = (+het_rates[27]) * (+y[27]); | ||
| prod[26] = zero; | ||
| loss[27] = (+het_rates[28]) * (+y[28]); | ||
| prod[27] = zero; | ||
| loss[28] = (+het_rates[29]) * (+y[29]); | ||
| prod[28] = zero; | ||
| loss[29] = (+het_rates[30]) * (+y[30]); | ||
| prod[29] = zero; | ||
| for (int i = 4; i < 31; ++i) { | ||
| loss[i] = het_rates[i + 1] * y[i + 1]; | ||
| prod[i] = zero; | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for removing the redundant [unary `+` signs]. […]
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It should be [`i < 30`: the unrolled assignments this loop replaces end at `loss[29]` / `prod[29]`]. |
||
| } // imp_prod_loss | ||
|
|
||
| KOKKOS_INLINE_FUNCTION | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -173,15 +173,14 @@ void set_ub_col(Real &o3_col_delta, | |
| KOKKOS_INLINE_FUNCTION | ||
| void setcol(const ThreadTeam &team, const Real o3_col_deltas[mam4::nlev + 1], | ||
| ColumnView &o3_col_dens) { | ||
| // we can probably accelerate this with a parallel_scan, but let's just do | ||
| // a simple loop for now | ||
| constexpr int nlev = mam4::nlev; | ||
| Kokkos::single(Kokkos::PerTeam(team), [=]() { | ||
| o3_col_dens(0) = 0.5 * (o3_col_deltas[0] + o3_col_deltas[1]); | ||
| for (int k = 1; k < nlev; ++k) { | ||
| o3_col_dens(k) = | ||
| o3_col_dens(k - 1) + 0.5 * (o3_col_deltas[k] + o3_col_deltas[k + 1]); | ||
| } | ||
| Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nlev), [&](int kk) { | ||
| Kokkos::parallel_reduce( | ||
| Kokkos::ThreadVectorRange(team, kk + 1), | ||
| [&](int i, Real &lsum) { | ||
| lsum += 0.5 * (o3_col_deltas[i] + o3_col_deltas[i + 1]); | ||
| }, | ||
| o3_col_dens(kk)); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it not possible to replace these two nested Kokkos loops with a single `Kokkos::parallel_scan`?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, it is possible. Does […]
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I do not know the internal implementation of [`Kokkos::parallel_scan`]. Is this code based on […]?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not 100% sure whether this routine comes from `chemistry/mozart/mo_photo.F90`. @singhbalwinder or @jeff-cohere, do you recall if we ported this routine directly from the Fortran code? If not, I believe we should update it.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here's the commit for the original code, annotated by […]. And here's the log message for that commit: […] It would be good to check this against the original Fortran code, of course!
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. And here's the original code. As @overfelt has pointed out, there's an incorrect factor of 1/2 in there, so it should definitely be updated: […] Sorry for the bug!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, @jeff-cohere, and no worries. @singhbalwinder, I will fix this bug in a follow-up PR because the fix will be NBFB (non-bit-for-bit).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Issue: #482
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, all! Yes, we should fix this in a follow-up PR. I also think we should use `parallel_scan` here instead of the nested loops, if possible. |
||
| }); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ using namespace mo_sethet; | |
| void sethet(Ensemble *ensemble) { | ||
| ensemble->process([=](const Input &input, Output &output) { | ||
| using View1DHost = typename HostType::view_1d<Real>; | ||
| using View2DHost = typename HostType::view_2d<Real>; | ||
| using ColumnView = haero::ColumnView; | ||
| constexpr int pver = mam4::nlev; | ||
| constexpr int gas_pcnst = mam4::gas_chemistry::gas_pcnst; | ||
|
|
@@ -101,34 +102,32 @@ void sethet(Ensemble *ensemble) { | |
| so2_diss = haero::testing::create_column_view(pver); | ||
|
|
||
| ColumnView tmp_hetrates[gas_pcnst]; | ||
| ColumnView qin[gas_pcnst]; | ||
| View1DHost tmp_hetrates_host[gas_pcnst]; | ||
| View1DHost qin_host[gas_pcnst]; | ||
| View2DHost qin_host("qin_host", pver, gas_pcnst); | ||
|
|
||
| View2D het_rates("het_rates", pver, gas_pcnst); | ||
| View2D qin("qin", pver, gas_pcnst); | ||
| auto het_rates_host = Kokkos::create_mirror_view(het_rates); | ||
|
|
||
| for (int mm = 0; mm < gas_pcnst; ++mm) { | ||
|
|
||
| tmp_hetrates[mm] = haero::testing::create_column_view(pver); | ||
| qin[mm] = haero::testing::create_column_view(pver); | ||
| tmp_hetrates_host[mm] = View1DHost("tmp_hetrates_host", pver); | ||
| qin_host[mm] = View1DHost("qin_host", pver); | ||
| } | ||
|
|
||
| int count = 0; | ||
| for (int mm = 0; mm < gas_pcnst; ++mm) { | ||
| for (int kk = 0; kk < pver; ++kk) { | ||
| qin_host[mm](kk) = qin_in[count]; | ||
| qin_host(kk, mm) = qin_in[count]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Whenever we switch between arrays and views, we should test the change on Frontier to make sure the model doesn't break there. |
||
| count++; | ||
| } | ||
| } | ||
|
|
||
| // transfer data to GPU. | ||
| for (int mm = 0; mm < gas_pcnst; ++mm) { | ||
| Kokkos::deep_copy(tmp_hetrates[mm], 0.0); | ||
| Kokkos::deep_copy(qin[mm], qin_host[mm]); | ||
| } | ||
| Kokkos::deep_copy(qin, qin_host); | ||
|
|
||
| auto team_policy = ThreadTeamPolicy(1u, Kokkos::AUTO); | ||
| Kokkos::parallel_for( | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.