Skip to content

Commit e6c6c94

Browse files
hhugoclaude
andcommitted
Simplify Extended_ast vs Std_ast
Make Extended_ast the single entry point for parsing, validation, and AST operations. Std_ast becomes a purely internal implementation detail. Extended_ast.t GADT now uses paired wrapper types that embed both the extended and standard ASTs: `('a, 'b) paired = {extended: 'a; std: 'b}`. For the 7 OCaml fragment types (Structure, Signature, Use_file, Core_type, Module_type, Expression, Pattern), parsing produces both ASTs in one operation. Repl_file and Documentation have no std counterpart. Main changes: - Add `('a, 'b) paired` record type to Extended_ast, used by the GADT for fragments with std parser support - Extended_ast.Parse.ast now internally calls Std_ast.Parse.ast to produce the paired result. A `paired` helper factors out the common parse-normalize-pair pattern. - Absorb Parse_with_comments into Extended_ast: - `Extended_ast.parse` is the full parsing pipeline (warnings, hash-bang, tokens, Source.t creation) - `Extended_ast.parse_toplevel` dispatches Use_file vs Repl_file - `Extended_ast.Parsed.t` replaces `Parse_with_comments.with_comments` - `Extended_ast.Warning50` replaces `Parse_with_comments.Warning50` - Delete Parse_with_comments.ml/.mli - Documentation bypasses the OCaml parsing pipeline entirely (no hash-bang handling, no w50, no token collection) - Add `Extended_ast.equivalent` to check AST preservation using the embedded std ASTs, returning `Ast_preserved | Docstrings_moved _ | Ast_changed`. Includes a TODO for Repl_file/Documentation which currently skip the check. - Add `Extended_ast.dump` and `dump_normalized` for debug output, falling back to extended AST printing for fragments without std - Break Normalize_std_ast -> Normalize_extended_ast dependency by making `normalize_code` a parameter instead of an import - Simplify Translation_unit: remove `std_fg`/`std_parsed` parameters from `format` and `parse_and_format`, remove `(type std)`, use `Extended_ast.equivalent` for validation. Translation_unit no longer references Std_ast directly. - Clean up Std_ast: remove `of_syntax`, `any_t`, `Repl_file`, `Documentation` constructors, comment collection from Parse.ast - Remove unused `Extended_ast.equal`, `equal_core_type`, `Normalize_extended_ast.equal`, `Normalize_extended_ast.ast` - Break Source -> Extended_ast cycle (Source.ml only needs Parsetree) - Document why Std_ast exists: the standard parser is used to verify formatting preserves semantics (two programs are equivalent if the compiler parses them identically) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 206c15a commit e6c6c94

16 files changed

Lines changed: 498 additions & 440 deletions

lib/Extended_ast.ml

Lines changed: 279 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,22 @@
1212
open Ocamlformat_parser_extended
1313
include Parsetree
1414

15-
let equal_core_type : core_type -> core_type -> bool = Poly.equal
16-
1715
type use_file = toplevel_phrase list
1816

1917
type repl_file = repl_phrase list
2018

19+
module Std_parsetree = Ocamlformat_parser_standard.Parsetree
20+
21+
type ('a, 'b) paired = {extended: 'a; std: 'b}
22+
2123
type 'a t =
22-
| Structure : structure t
23-
| Signature : signature t
24-
| Use_file : use_file t
25-
| Core_type : core_type t
26-
| Module_type : module_type t
27-
| Expression : expression t
28-
| Pattern : pattern t
24+
| Structure : (structure, Std_parsetree.structure) paired t
25+
| Signature : (signature, Std_parsetree.signature) paired t
26+
| Use_file : (use_file, Std_parsetree.toplevel_phrase list) paired t
27+
| Core_type : (core_type, Std_parsetree.core_type) paired t
28+
| Module_type : (module_type, Std_parsetree.module_type) paired t
29+
| Expression : (expression, Std_parsetree.expression) paired t
30+
| Pattern : (pattern, Std_parsetree.pattern) paired t
2931
| Repl_file : repl_file t
3032
| Documentation : Ocamlformat_odoc_parser.Ast.t t
3133

@@ -42,17 +44,16 @@ let of_syntax = function
4244
| Repl_file -> Any Repl_file
4345
| Documentation -> Any Documentation
4446

45-
let equal (type a) (_ : a t) : a -> a -> bool = Poly.equal
46-
4747
let map (type a) (x : a t) (m : Ast_mapper.mapper) : a -> a =
4848
match x with
49-
| Structure -> m.structure m
50-
| Signature -> m.signature m
51-
| Use_file -> List.map ~f:(m.toplevel_phrase m)
52-
| Core_type -> m.typ m
53-
| Module_type -> m.module_type m
54-
| Expression -> m.expr m
55-
| Pattern -> m.pat m
49+
| Structure -> fun v -> {v with extended= m.structure m v.extended}
50+
| Signature -> fun v -> {v with extended= m.signature m v.extended}
51+
| Use_file ->
52+
fun v -> {v with extended= List.map ~f:(m.toplevel_phrase m) v.extended}
53+
| Core_type -> fun v -> {v with extended= m.typ m v.extended}
54+
| Module_type -> fun v -> {v with extended= m.module_type m v.extended}
55+
| Expression -> fun v -> {v with extended= m.expr m v.extended}
56+
| Pattern -> fun v -> {v with extended= m.pat m v.extended}
5657
| Repl_file -> List.map ~f:(m.repl_phrase m)
5758
| Documentation -> Fn.id
5859

@@ -346,23 +347,51 @@ module Parse = struct
346347

347348
let ast (type a) (fg : a t) ~ocaml_version ~preserve_beginend
348349
~prefer_let_puns ~input_name str : a =
349-
map fg
350-
(normalize_mapper ~ocaml_version ~preserve_beginend ~prefer_let_puns)
351-
@@
350+
let nm =
351+
normalize_mapper ~ocaml_version ~preserve_beginend ~prefer_let_puns
352+
in
352353
let lexbuf = Lexing.from_string str in
353-
let ocaml_version =
354+
let ocaml_version_pair =
354355
Some Ocaml_version.(major ocaml_version, minor ocaml_version)
355356
in
356357
Location.init_info lexbuf input_name ;
358+
let parse_std std_fg =
359+
(* Suppress warnings during raw std parse to avoid duplicate w50
360+
warnings — w50 handling is done separately by parse_ocaml *)
361+
Warning.with_warning_filter
362+
~filter_warning:(fun _loc _warn -> false)
363+
~filter_alert:(fun _loc _alert -> false)
364+
~f:(fun () -> Std_ast.Parse.ast std_fg ~ocaml_version ~input_name str)
365+
in
366+
let paired normalize parse_ext std_fg =
367+
let extended =
368+
normalize nm (parse_ext ~ocaml_version:ocaml_version_pair lexbuf)
369+
in
370+
{extended; std= parse_std std_fg}
371+
in
357372
match fg with
358-
| Structure -> Parse.implementation ~ocaml_version lexbuf
359-
| Signature -> Parse.interface ~ocaml_version lexbuf
360-
| Use_file -> Parse.use_file ~ocaml_version lexbuf
361-
| Core_type -> Parse.core_type ~ocaml_version lexbuf
362-
| Module_type -> Parse.module_type ~ocaml_version lexbuf
363-
| Expression -> Parse.expression ~ocaml_version lexbuf
364-
| Pattern -> Parse.pattern ~ocaml_version lexbuf
365-
| Repl_file -> Toplevel_lexer.repl_file ~ocaml_version lexbuf
373+
| Structure ->
374+
paired
375+
(fun nm -> nm.structure nm)
376+
Parse.implementation Std_ast.Structure
377+
| Signature ->
378+
paired (fun nm -> nm.signature nm) Parse.interface Std_ast.Signature
379+
| Use_file ->
380+
paired
381+
(fun nm -> List.map ~f:(nm.toplevel_phrase nm))
382+
Parse.use_file Std_ast.Use_file
383+
| Core_type ->
384+
paired (fun nm -> nm.typ nm) Parse.core_type Std_ast.Core_type
385+
| Module_type ->
386+
paired
387+
(fun nm -> nm.module_type nm)
388+
Parse.module_type Std_ast.Module_type
389+
| Expression ->
390+
paired (fun nm -> nm.expr nm) Parse.expression Std_ast.Expression
391+
| Pattern -> paired (fun nm -> nm.pat nm) Parse.pattern Std_ast.Pattern
392+
| Repl_file ->
393+
List.map ~f:(nm.repl_phrase nm)
394+
(Toplevel_lexer.repl_file ~ocaml_version:ocaml_version_pair lexbuf)
366395
| Documentation ->
367396
let pos = (Location.curr lexbuf).loc_start in
368397
let pos = {pos with pos_fname= input_name} in
@@ -377,13 +406,13 @@ module Printast = struct
377406
let repl_file = Format.pp_print_list repl_phrase
378407

379408
let ast (type a) : a t -> _ -> a -> _ = function
380-
| Structure -> implementation
381-
| Signature -> interface
382-
| Use_file -> use_file
383-
| Core_type -> core_type
384-
| Module_type -> module_type
385-
| Expression -> expression
386-
| Pattern -> pattern
409+
| Structure -> fun fmt v -> implementation fmt v.extended
410+
| Signature -> fun fmt v -> interface fmt v.extended
411+
| Use_file -> fun fmt v -> use_file fmt v.extended
412+
| Core_type -> fun fmt v -> core_type fmt v.extended
413+
| Module_type -> fun fmt v -> module_type fmt v.extended
414+
| Expression -> fun fmt v -> expression fmt v.extended
415+
| Pattern -> fun fmt v -> pattern fmt v.extended
387416
| Repl_file -> repl_file
388417
| Documentation -> Docstring.dump
389418
end
@@ -395,3 +424,216 @@ module Asttypes = struct
395424

396425
let is_recursive = function Recursive -> true | Nonrecursive -> false
397426
end
427+
428+
type std_value = Std_value : 'a Std_ast.t * 'a -> std_value
429+
430+
let get_std (type a) (fg : a t) (v : a) : std_value option =
431+
match fg with
432+
| Structure -> Some (Std_value (Structure, v.std))
433+
| Signature -> Some (Std_value (Signature, v.std))
434+
| Use_file -> Some (Std_value (Use_file, v.std))
435+
| Core_type -> Some (Std_value (Core_type, v.std))
436+
| Module_type -> Some (Std_value (Module_type, v.std))
437+
| Expression -> Some (Std_value (Expression, v.std))
438+
| Pattern -> Some (Std_value (Pattern, v.std))
439+
| Repl_file -> None
440+
| Documentation -> None
441+
442+
type std_pair = Std_pair : 'a Std_ast.t * 'a * 'a -> std_pair
443+
444+
let get_std_pair (type a) (fg : a t) (v1 : a) (v2 : a) : std_pair option =
445+
match fg with
446+
| Structure -> Some (Std_pair (Structure, v1.std, v2.std))
447+
| Signature -> Some (Std_pair (Signature, v1.std, v2.std))
448+
| Use_file -> Some (Std_pair (Use_file, v1.std, v2.std))
449+
| Core_type -> Some (Std_pair (Core_type, v1.std, v2.std))
450+
| Module_type -> Some (Std_pair (Module_type, v1.std, v2.std))
451+
| Expression -> Some (Std_pair (Expression, v1.std, v2.std))
452+
| Pattern -> Some (Std_pair (Pattern, v1.std, v2.std))
453+
| Repl_file -> None
454+
| Documentation -> None
455+
456+
let dump (type a) (fg : a t) fmt (v : a) =
457+
match get_std fg v with
458+
| Some (Std_value (std_fg, std_v)) -> Std_ast.Printast.ast std_fg fmt std_v
459+
| None -> Printast.ast fg fmt v
460+
461+
let dump_normalized (type a) (fg : a t) ~normalize_code conf fmt (v : a) =
462+
match get_std fg v with
463+
| Some (Std_value (std_fg, std_v)) ->
464+
Std_ast.Printast.ast std_fg fmt
465+
(Normalize_std_ast.ast std_fg ~normalize_code conf std_v)
466+
| None -> Printast.ast fg fmt v
467+
468+
type ast_check_result =
469+
| Ast_preserved
470+
| Docstrings_moved of Cmt.error list
471+
| Ast_changed
472+
473+
let equivalent (type a) (fg : a t) ~normalize_code conf (old_v : a)
474+
(new_v : a) : ast_check_result =
475+
match get_std_pair fg old_v new_v with
476+
| None ->
477+
(* TODO: Repl_file and Documentation have no std AST, so we skip the
478+
equivalence check.
479+
- Repl_file: each toplevel phrase is OCaml code that could be
480+
validated individually by parsing it with the standard parser.
481+
- Documentation: OCaml code blocks inside .mld files are formatted
482+
but never validated for AST preservation. We should check each
483+
formatted code block by parsing it with the standard parser and
484+
comparing. *)
485+
Ast_preserved
486+
| Some (Std_pair (std_fg, old_std, new_std)) ->
487+
if
488+
Normalize_std_ast.equal std_fg ~normalize_code
489+
~ignore_doc_comments:(not conf.Conf.opr_opts.comment_check.v)
490+
conf old_std new_std
491+
then Ast_preserved
492+
else if
493+
Normalize_std_ast.equal std_fg ~normalize_code
494+
~ignore_doc_comments:true conf old_std new_std
495+
then
496+
Docstrings_moved
497+
(Normalize_std_ast.moved_docstrings ~normalize_code std_fg conf
498+
old_std new_std )
499+
else Ast_changed
500+
501+
module Parsed = struct
502+
type 'a t =
503+
{ast: 'a; comments: Cmt.t list; prefix: string; source: Source.t}
504+
end
505+
506+
exception Warning50 of (Location.t * Warnings.t) list
507+
508+
module W = struct
509+
type t = int
510+
511+
let in_lexer : t list = [1; 2; 3; 14; 29]
512+
513+
let disable x = -abs x
514+
515+
let enable x = abs x
516+
517+
let to_string x =
518+
String.concat ~sep:"" (List.map ~f:(Format.sprintf "%+d") x)
519+
end
520+
521+
let tokens lexbuf =
522+
let rec loop acc =
523+
match Lexer.token_with_comments lexbuf with
524+
(* The location in lexbuf are invalid for comments *)
525+
| COMMENT (_, loc) as tok -> loop ((tok, loc) :: acc)
526+
| DOCSTRING ds as tok -> loop ((tok, Docstrings.docstring_loc ds) :: acc)
527+
| tok -> (
528+
let loc = Migrate_ast.Location.of_lexbuf lexbuf in
529+
let acc = (tok, loc) :: acc in
530+
match tok with EOF -> List.rev acc | _ -> loop acc )
531+
in
532+
loop []
533+
534+
let fresh_lexbuf source =
535+
let lexbuf = Lexing.from_string source in
536+
Location.init_info lexbuf !Location.input_name ;
537+
let hash_bang =
538+
Lexer.skip_hash_bang lexbuf ;
539+
let len = lexbuf.lex_last_pos in
540+
String.sub source ~pos:0 ~len
541+
in
542+
(lexbuf, hash_bang)
543+
544+
let split_hash_bang source =
545+
let lexbuf = Lexing.from_string source in
546+
Location.init_info lexbuf !Location.input_name ;
547+
Lexer.skip_hash_bang lexbuf ;
548+
let len = lexbuf.lex_last_pos in
549+
let hash_bang = String.sub source ~pos:0 ~len in
550+
let rest = String.sub source ~pos:len ~len:(String.length source - len) in
551+
(rest, hash_bang)
552+
553+
let collect_comments () =
554+
List.map (Lexer.comments ()) ~f:(function
555+
| `Comment txt, loc -> Cmt.create_comment txt loc
556+
| `Docstring txt, loc -> Cmt.create_docstring txt loc )
557+
558+
let parse_ocaml ?(disable_w50 = false) ?(disable_deprecated = false) fg
559+
(conf : Conf.t) ~input_name ~source =
560+
let warnings =
561+
if conf.opr_opts.quiet.v then List.map ~f:W.disable W.in_lexer else []
562+
in
563+
let warnings = if disable_w50 then warnings else W.enable 50 :: warnings in
564+
ignore @@ Warnings.parse_options false (W.to_string warnings) ;
565+
let w50 = ref [] in
566+
let t =
567+
let source, hash_bang = split_hash_bang source in
568+
Warning.with_warning_filter
569+
~filter_warning:(fun loc warn ->
570+
if
571+
Warning.is_unexpected_docstring warn
572+
&& conf.opr_opts.comment_check.v
573+
then (
574+
w50 := (loc, warn) :: !w50 ;
575+
false )
576+
else not conf.opr_opts.quiet.v )
577+
~filter_alert:(fun _loc alert ->
578+
if Warning.is_deprecated_alert alert && disable_deprecated then false
579+
else not conf.opr_opts.quiet.v )
580+
~f:(fun () ->
581+
let ocaml_version = conf.opr_opts.ocaml_version.v in
582+
let preserve_beginend =
583+
Poly.(conf.fmt_opts.exp_grouping.v = `Preserve)
584+
in
585+
let prefer_let_puns =
586+
match conf.fmt_opts.letop_punning.v with
587+
| `Always -> Some true
588+
| `Never -> Some false
589+
| `Preserve -> None
590+
in
591+
let ast =
592+
Parse.ast fg ~ocaml_version ~preserve_beginend ~prefer_let_puns
593+
~input_name source
594+
in
595+
let comments = collect_comments () in
596+
Warnings.check_fatal () ;
597+
let tokens =
598+
let lexbuf, _ = fresh_lexbuf source in
599+
tokens lexbuf
600+
in
601+
let source = Source.create ~text:source ~tokens in
602+
{Parsed.ast; comments; prefix= hash_bang; source} )
603+
in
604+
match List.rev !w50 with [] -> t | w50 -> raise (Warning50 w50)
605+
606+
let parse (type a) ?disable_w50 ?disable_deprecated (fg : a t) conf
607+
~input_name ~source : a Parsed.t =
608+
match fg with
609+
| Documentation ->
610+
let pos = {Lexing.dummy_pos with pos_fname= input_name} in
611+
let ast = Docstring.parse_file pos source in
612+
{ Parsed.ast
613+
; comments= []
614+
; prefix= ""
615+
; source= Source.create ~text:source ~tokens:[] }
616+
| fg ->
617+
parse_ocaml ?disable_w50 ?disable_deprecated fg conf ~input_name
618+
~source
619+
620+
(** [is_repl_block x] returns whether [x] is a list of REPL phrases and
621+
outputs of the form:
622+
623+
{v
624+
# let this is = some phrase;;
625+
this is some output
626+
v} *)
627+
let is_repl_block x =
628+
String.length x >= 2 && Char.equal x.[0] '#' && Char.is_whitespace x.[1]
629+
630+
let parse_toplevel ?disable_w50 ?disable_deprecated (conf : Conf.t)
631+
~input_name ~source =
632+
if is_repl_block source && conf.fmt_opts.parse_toplevel_phrases.v then
633+
Either.Second
634+
(parse ?disable_w50 ?disable_deprecated Repl_file conf ~input_name
635+
~source )
636+
else
637+
First
638+
(parse ?disable_w50 ?disable_deprecated Use_file conf ~input_name
639+
~source )

0 commit comments

Comments
 (0)