Skip to content

Commit 3fdcc77

Browse files
Adding transcribe mode (#92)
* Adding `hyde-transcribe` to update docs from version to version of hyde * updating hyde to llvm 18.1.8 * adding folder transcription for classes * Making `derive_transcription_src_path` generally available
1 parent b2693a2 commit 3fdcc77

9 files changed

+370
-52
lines changed

CMakeLists.txt

+11
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,21 @@ FetchContent_Declare(
3636
SOURCE_SUBDIR llvm
3737
)
3838

39+
FetchContent_Declare(
40+
diff
41+
GIT_REPOSITORY https://github.com/fosterbrereton/diff.git
42+
GIT_TAG 2ba1687a30de266416caa141f8be408e72843be0
43+
GIT_SHALLOW TRUE
44+
GIT_PROGRESS TRUE
45+
SOURCE_SUBDIR diff
46+
)
47+
3948
set(LLVM_ENABLE_PROJECTS "clang")
4049
set(LLVM_TARGETS_TO_BUILD "X86;AArch64")
4150
set(LLVM_ENABLE_ZSTD OFF)
4251

4352
FetchContent_MakeAvailable(llvm)
53+
FetchContent_MakeAvailable(diff)
4454

4555
message(STATUS "INFO: LLVM source dir: ${llvm_SOURCE_DIR}")
4656
message(STATUS "INFO: LLVM binary dir: ${llvm_BINARY_DIR}")
@@ -127,6 +137,7 @@ target_include_directories(hyde
127137
${llvm_BINARY_DIR}/tools/clang/include
128138
${llvm_SOURCE_DIR}/llvm/include
129139
${llvm_BINARY_DIR}/include
140+
${diff_SOURCE_DIR}/include
130141
)
131142

132143
target_compile_options(hyde

emitters/yaml_base_emitter.cpp

+150-26
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ json yaml_base_emitter::base_emitter_node(std::string layout,
260260
node["hyde"]["owner"] = default_tag_value;
261261
node["hyde"]["tags"].emplace_back(std::move(tag));
262262
node["hyde"]["brief"] = default_tag_value;
263+
node["hyde"]["version"] = hyde_version();
263264

264265
return node;
265266
}
@@ -392,6 +393,7 @@ void yaml_base_emitter::check_notify(const std::string& filepath,
392393
std::cerr << filepath << "@" << escaped_nodepath << "['" << escaped_key
393394
<< "']: " << validate_message << "\n";
394395
} break;
396+
case yaml_mode::transcribe:
395397
case yaml_mode::update: {
396398
std::cout << filepath << "@" << escaped_nodepath << "['" << escaped_key
397399
<< "']: " << update_message << "\n";
@@ -980,6 +982,78 @@ bool yaml_base_emitter::check_object_array(const std::string& filepath,
980982

981983
/**************************************************************************************************/
982984

985+
std::vector<std::string> object_keys(const json& j) {
986+
std::vector<std::string> result;
987+
988+
for (auto iter{j.begin()}, last{j.end()}; iter != last; ++iter) {
989+
result.push_back(static_cast<const std::string&>(iter.key()));
990+
}
991+
992+
return result;
993+
}
994+
995+
// Appends every element of `src` to the end of `dst`, moving the elements rather than copying
// them. Afterwards `src` still holds its (moved-from) elements.
template <class T>
inline void move_append(T& dst, T&& src) {
    auto first = std::make_move_iterator(std::begin(src));
    auto last = std::make_move_iterator(std::end(src));

    dst.insert(std::end(dst), first, last);
}
999+
1000+
// One key-to-key mapping produced by transcription: a key taken from the source set and the
// key from the destination set that it has been matched with.
struct transcribe_pair {
    // key as it appears in the source (old) set
    std::string src;

    // key as it appears in the destination (new) set
    std::string dst;
};

// The full list of src/dst key pairings.
using transcribe_pairs = std::vector<transcribe_pair>;
1006+
1007+
// This is O(N^2), where N is the size of both `src` and `dst`. Therefore transcription
1008+
// should only be run when it is shown to be necessary. At the same time, if your code base
1009+
// has enough overrides to really slow this algorithm down, hyde's performance is the least
1010+
// of your concerns.
1011+
transcribe_pairs derive_transcribe_pairs(const json& src, const json& dst) {
1012+
std::vector<std::string> src_keys = object_keys(src);
1013+
std::vector<std::string> dst_keys = object_keys(dst);
1014+
1015+
if (src_keys.size() != dst_keys.size()) {
1016+
std::cerr << "WARNING: transcription key count mismatch\n";
1017+
}
1018+
1019+
transcribe_pairs result;
1020+
1021+
while (!src_keys.empty()) {
1022+
transcribe_pair cur_pair;
1023+
1024+
// pop a key off the old name set
1025+
cur_pair.src = std::move(src_keys.back());
1026+
src_keys.pop_back();
1027+
1028+
// find the best match of the dst keys to the src key
1029+
std::size_t best_match = std::numeric_limits<std::size_t>::max();
1030+
std::size_t best_index = 0;
1031+
for (std::size_t i = 0; i < dst_keys.size(); ++i) {
1032+
// generate the diff score of the src key and the candidate dst
1033+
std::size_t cur_match = diff_score(cur_pair.src, dst_keys[i]);
1034+
1035+
if (cur_match > best_match) {
1036+
continue;
1037+
}
1038+
1039+
// if this dst candidate is better than what we've seen, remember that.
1040+
best_match = cur_match;
1041+
best_index = i;
1042+
}
1043+
1044+
// pair the best match dst and src keys and remove dst
1045+
cur_pair.dst = std::move(dst_keys[best_index]);
1046+
dst_keys.erase(dst_keys.begin() + best_index);
1047+
1048+
// save off the pair and repeat
1049+
result.emplace_back(std::move(cur_pair));
1050+
}
1051+
1052+
return result;
1053+
}
1054+
1055+
/**************************************************************************************************/
1056+
9831057
bool yaml_base_emitter::check_map(const std::string& filepath,
9841058
const json& have_node,
9851059
const json& expected_node,
@@ -1013,38 +1087,68 @@ bool yaml_base_emitter::check_map(const std::string& filepath,
10131087
}
10141088

10151089
const json& have = have_node[key];
1090+
bool failure{false};
1091+
json result_map;
10161092

1017-
std::vector<std::string> keys;
1018-
1019-
for (auto iter{have.begin()}, last{have.end()}; iter != last; ++iter) {
1020-
keys.push_back(static_cast<const std::string&>(iter.key()));
1021-
}
1022-
for (auto iter{expected.begin()}, last{expected.end()}; iter != last; ++iter) {
1023-
keys.push_back(static_cast<const std::string&>(iter.key()));
1024-
}
1093+
if (key == "overloads" && _mode == yaml_mode::transcribe) {
1094+
/*
1095+
It is common during the upgrade from one version of hyde to another that the underlying
1096+
clang tooling will output different symbol names for a given symbol (e.g., a namespace
1097+
may get removed or added.) Although the symbol is unchanged, because its `expected` name
1098+
differs from the `have` name, hyde will consider the symbols different, remove the old name
1099+
and insert the new one. This wipes out any previous documentation under the old name that
1100+
should have been migrated to the new name.
1101+
1102+
The solution here is very specialized. For the "overloads" key only, we gather the name
1103+
of each overload in both the `have` and `expected` set. We then pair them up according
1104+
to how well they match to one another (using the Myers' string diff algorithm; two strings
1105+
with less "patchwork" between them are considered a better match). Ideally this results in
1106+
key pairs that represent the same symbol, just with different names. Then we call the
1107+
`proc` with `have[old_name]` and `expected[new_name]` which will migrate any documentation
1108+
from the old name to the new.
1109+
1110+
This capability assumes the overload count of both `have` and `expected` are the same.
1111+
If any new functions are created or removed between upgrades in the clang driver (e.g.,
1112+
a new compiler-generated routine is created and documented) that will have to be managed
1113+
manually. Assuming the count is the same, it also assumes there is a 1:1 mapping from the
1114+
set of old names to the set of new names. This implies the transcription mode should be
1115+
done as a separate step from an update. In other words, a transcription assumes the
1116+
documentation is actually the same between the `have` and `expected` sets, it is _just the
1117+
overload names_ that have changed, so map the old-named documentation to the new-named
1118+
documentation as reasonably as possible.
1119+
*/
1120+
for (const auto& pair : derive_transcribe_pairs(have, expected)) {
1121+
const std::string curnodepath = nodepath + "['" + pair.dst + "']";
1122+
failure |= proc(filepath, have[pair.src], expected[pair.dst], curnodepath,
1123+
result_map[pair.dst]);
1124+
}
1125+
} else {
1126+
std::vector<std::string> keys;
10251127

1026-
std::sort(keys.begin(), keys.end());
1027-
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
1128+
move_append(keys, object_keys(have));
1129+
move_append(keys, object_keys(expected));
10281130

1029-
bool failure{false};
1131+
std::sort(keys.begin(), keys.end());
1132+
keys.erase(std::unique(keys.begin(), keys.end()), keys.end());
10301133

1031-
json result_map;
1032-
for (const auto& subkey : keys) {
1033-
std::string curnodepath = nodepath + "['" + subkey + "']";
1134+
for (const auto& subkey : keys) {
1135+
const std::string curnodepath = nodepath + "['" + subkey + "']";
10341136

1035-
if (!expected.count(subkey)) {
1036-
// Issue #75: only remove non-root keys to allow non-hyde YAML into the file.
1037-
if (!at_root) {
1038-
notify("extraneous map key: `" + subkey + "`", "map key removed: `" + subkey + "`");
1137+
if (!expected.count(subkey)) {
1138+
// Issue #75: only remove non-root keys to allow non-hyde YAML into the file.
1139+
if (!at_root) {
1140+
notify("extraneous map key: `" + subkey + "`",
1141+
"map key removed: `" + subkey + "`");
1142+
failure = true;
1143+
}
1144+
} else if (!have.count(subkey)) {
1145+
notify("map key missing: `" + subkey + "`", "map key inserted: `" + subkey + "`");
1146+
result_map[subkey] = expected[subkey];
10391147
failure = true;
1148+
} else {
1149+
failure |=
1150+
proc(filepath, have[subkey], expected[subkey], curnodepath, result_map[subkey]);
10401151
}
1041-
} else if (!have.count(subkey)) {
1042-
notify("map key missing: `" + subkey + "`", "map key inserted: `" + subkey + "`");
1043-
result_map[subkey] = expected[subkey];
1044-
failure = true;
1045-
} else {
1046-
failure |=
1047-
proc(filepath, have[subkey], expected[subkey], curnodepath, result_map[subkey]);
10481152
}
10491153
}
10501154

@@ -1103,6 +1207,24 @@ std::pair<bool, json> yaml_base_emitter::merge(const std::string& filepath,
11031207
check_editable_scalar(filepath, have_hyde, expected_hyde, "", merged_hyde, "brief");
11041208
failure |= check_scalar_array(filepath, have_hyde, expected_hyde, "", merged_hyde, "tags");
11051209

1210+
// We don't want to use `check_scalar` on the version key. If the versions mismatch it's not
1211+
// necessarily a validation error (as the docs may match OK), but something we want to warn
1212+
// about. Then in transcription/update we want to hard-set the value to the version of this
1213+
// tool.
1214+
1215+
switch (_mode) {
1216+
case yaml_mode::validate: {
1217+
if (!have_hyde.count("version") ||
1218+
static_cast<const std::string&>(have_hyde["version"]) != hyde_version()) {
1219+
std::cerr << "INFO: Validation phase with a mismatched version of hyde. Consider updating then/or transcribing.\n";
1220+
}
1221+
} break;
1222+
case yaml_mode::update:
1223+
case yaml_mode::transcribe: {
1224+
merged_hyde["version"] = hyde_version();
1225+
} break;
1226+
}
1227+
11061228
failure |= do_merge(filepath, have_hyde, expected_hyde, merged_hyde);
11071229
}
11081230

@@ -1264,7 +1386,7 @@ documentation parse_documentation(const std::filesystem::path& path, bool fixup_
12641386
const auto front_matter_end = contents_end + front_matter_end_k.size();
12651387
std::string yaml_src = have_contents.substr(0, front_matter_end);
12661388
have_contents.erase(0, front_matter_end);
1267-
1389+
12681390
result._remainder = std::move(have_contents);
12691391
result._json = yaml_to_json(load_yaml(path));
12701392

@@ -1342,6 +1464,7 @@ bool yaml_base_emitter::reconcile(json expected,
13421464
case hyde::yaml_mode::validate: {
13431465
// do nothing
13441466
} break;
1467+
case hyde::yaml_mode::transcribe:
13451468
case hyde::yaml_mode::update: {
13461469
failure = write_documentation({std::move(merged), std::move(remainder)}, path);
13471470
} break;
@@ -1354,6 +1477,7 @@ bool yaml_base_emitter::reconcile(json expected,
13541477
std::cerr << relative_path << ": required file does not exist\n";
13551478
failure = true;
13561479
} break;
1480+
case hyde::yaml_mode::transcribe:
13571481
case hyde::yaml_mode::update: {
13581482
// Add update. No remainder yet, as above.
13591483
// REVISIT: Refactor all this into a call to write_documentation,

emitters/yaml_class_emitter.cpp

+16-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ written permission of Adobe.
1717

1818
// application
1919
#include "emitters/yaml_function_emitter.hpp"
20+
#include "matchers/utilities.hpp"
2021

2122
/**************************************************************************************************/
2223

@@ -108,19 +109,32 @@ bool yaml_class_emitter::emit(const json& j, json& out_emitted, const json& inhe
108109

109110
auto dst = dst_path(j, static_cast<const std::string&>(j["name"]));
110111

112+
if (_mode == yaml_mode::transcribe && !exists(dst)) {
113+
// In this case the symbol name has changed, which has caused a change to the directory name
114+
// we are now trying to load and reconcile with what we've created. In this case, we can
115+
// assume the "shape" of the documentation is the same, which means that within the parent
116+
// folder of `dst` is the actual source folder that holds the old documentation, just under
117+
// a different name. Find that folder and rename it.
118+
119+
std::filesystem::rename(derive_transcription_src_path(dst, node["title"]), dst);
120+
}
121+
111122
bool failure =
112123
reconcile(std::move(node), _dst_root, std::move(dst) / index_filename_k, out_emitted);
113124

114-
const auto& methods = j["methods"];
115125
yaml_function_emitter function_emitter(_src_root, _dst_root, _mode, _options, true);
126+
auto emitted_methods = hyde::json::array();
127+
const auto& methods = j["methods"];
116128

117129
for (auto it = methods.begin(); it != methods.end(); ++it) {
118130
function_emitter.set_key(it.key());
119131
auto function_emitted = hyde::json::object();
120132
failure |= function_emitter.emit(it.value(), function_emitted, out_emitted.at("hyde"));
121-
out_emitted["methods"].push_back(std::move(function_emitted));
133+
emitted_methods.push_back(std::move(function_emitted));
122134
}
123135

136+
out_emitted["methods"] = std::move(emitted_methods);
137+
124138
return failure;
125139
}
126140

emitters/yaml_function_emitter.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ written permission of Adobe.
1515
// stdc++
1616
#include <iostream>
1717

18+
// application
19+
#include "matchers/utilities.hpp"
20+
1821
/**************************************************************************************************/
1922

2023
namespace hyde {
@@ -256,6 +259,16 @@ bool yaml_function_emitter::emit(const json& jsn, json& out_emitted, const json&
256259
if (is_ctor) node["hyde"]["is_ctor"] = true;
257260
if (is_dtor) node["hyde"]["is_dtor"] = true;
258261

262+
if (_mode == yaml_mode::transcribe && !exists(dst)) {
263+
// In this case the symbol name has changed, which has caused a change to the directory name
264+
// we are now trying to load and reconcile with what we've created. In this case, we can
265+
// assume the "shape" of the documentation is the same, which means that within the parent
266+
// folder of `dst` is the actual source folder that holds the old documentation, just under
267+
// a different name. Find that folder and rename it.
268+
269+
std::filesystem::rename(derive_transcription_src_path(dst, node["title"]), dst);
270+
}
271+
259272
return reconcile(std::move(node), _dst_root, dst / (filename + ".md"), out_emitted);
260273
}
261274

include/output_yaml.hpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ namespace hyde {
2525
/**************************************************************************************************/
2626

2727
// Operating mode of the YAML emitters.
enum class yaml_mode {
    // ensure the destination docs match the shape of the generated docs
    validate,

    // update the destination docs to match the shape of the generated docs
    update,

    // transcribe the destination docs to match the symbols put out by upgraded tooling
    transcribe,
};
3132

3233
/**************************************************************************************************/

0 commit comments

Comments
 (0)