···2020 | Invalid_flow_indentation (** Content in flow collection must be indented *)
2121 | Tab_in_indentation
2222 | Invalid_block_scalar_header of string
2323+ | Invalid_quoted_scalar_indentation of string
2324 | Invalid_directive of string
2425 | Invalid_yaml_version of string
2526 | Invalid_tag_directive of string
···121122 | Tab_in_indentation -> "tab character in indentation"
122123 | Invalid_block_scalar_header s ->
123124 Printf.sprintf "invalid block scalar header: %s" s
125125+ | Invalid_quoted_scalar_indentation s ->
126126+ Printf.sprintf "%s" s
124127 | Invalid_directive s -> Printf.sprintf "invalid directive: %s" s
125128 | Invalid_yaml_version s -> Printf.sprintf "invalid YAML version: %s" s
126129 | Invalid_tag_directive s -> Printf.sprintf "invalid TAG directive: %s" s
+39-3
yaml/ocaml-yamle/lib/parser.ml
···66 | Implicit_document_start
77 | Document_start
88 | Document_content
99+ | Document_content_done (* After parsing a node, check for unexpected content *)
910 | Document_end
1011 | Block_node
1112 | Block_node_or_indentless_sequence
···3637 mutable tag_directives : (string * string) list;
3738 mutable current_token : Token.spanned option;
3839 mutable finished : bool;
4040+ mutable explicit_doc_end : bool; (** True if last doc ended with explicit ... *)
4141+ mutable stream_start : bool; (** True if we haven't emitted any documents yet *)
3942}
40434144let create scanner = {
···5053 ];
5154 current_token = None;
5255 finished = false;
5656+ explicit_doc_end = false;
5757+ stream_start = true;
5358}
54595560let of_string s = create (Scanner.of_string s)
···208213 | None -> Span.point Position.initial
209214 in
210215216216+ (* After first document, stream_start is false *)
217217+ t.stream_start <- false;
211218 push_state t Document_end;
212219 t.state <- Document_content;
213220 Event.Document_start { version = t.version; implicit }, span
···222229223230 if not implicit then skip_token t;
224231232232+ (* Track if this document ended explicitly with ... *)
233233+ t.explicit_doc_end <- not implicit;
225234 t.state <- Implicit_document_start;
226235 Event.Document_end { implicit }, span
227236···599608 empty_scalar_event ~anchor:None ~tag:None tok.span
600609601610(** Main state machine dispatcher *)
602602-let parse t =
611611+let rec parse t =
603612 match t.state with
604613 | Stream_start ->
605614 parse_stream_start t
···607616 | Implicit_document_start ->
608617 (* Skip any document end markers before checking what's next *)
609618 while check t (function Token.Document_end -> true | _ -> false) do
619619+ t.explicit_doc_end <- true; (* Seeing ... counts as explicit end *)
610620 skip_token t
611621 done;
612622···617627 t.state <- End;
618628 t.finished <- true;
619629 Event.Stream_end, tok.span
620620- | Token.Version_directive _ | Token.Tag_directive _ | Token.Document_start ->
630630+ | Token.Version_directive _ | Token.Tag_directive _ ->
631631+ (* Directives are only allowed at stream start or after explicit ... (MUS6/01) *)
632632+ if not t.stream_start && not t.explicit_doc_end then
633633+ Error.raise_span tok.span (Invalid_directive "directives require explicit document end '...' before them");
634634+ parse_document_start t ~implicit:false
635635+ | Token.Document_start ->
621636 parse_document_start t ~implicit:false
622637 (* These tokens are invalid at document start - they indicate leftover junk *)
623638 | Token.Flow_sequence_end | Token.Flow_mapping_end | Token.Flow_entry
···638653 let tok = current_token t in
639654 t.state <- pop_state t;
640655 empty_scalar_event ~anchor:None ~tag:None tok.span
641641- end else
656656+ end else begin
657657+ (* Push Document_content_done so we return there after parsing the node.
658658+ This allows us to check for unexpected content after the node. *)
659659+ push_state t Document_content_done;
642660 parse_node t ~block:true ~indentless:false
661661+ end
662662+663663+ | Document_content_done ->
664664+ (* After parsing a node in document content, check for unexpected content *)
665665+ if check t (function
666666+ | Token.Version_directive _ | Token.Tag_directive _
667667+ | Token.Document_start | Token.Document_end | Token.Stream_end -> true
668668+ | _ -> false)
669669+ then begin
670670+ (* Valid document boundary - continue to Document_end *)
671671+ t.state <- pop_state t;
672672+ parse t (* Continue to emit the next event *)
673673+ end else begin
674674+ (* Unexpected content after document value - this is an error (KS4U, BS4K) *)
675675+ let tok = current_token t in
676676+ Error.raise_span tok.span
677677+ (Unexpected_token "content not allowed after document value")
678678+ end
643679644680 | Document_end ->
645681 parse_document_end t
+75-12
yaml/ocaml-yamle/lib/scanner.ml
···9999 done
100100 end
101101102102-(** Skip blanks (spaces/tabs) and return whether tabs were found *)
102102+(** Skip blanks (spaces/tabs) and return (found_tabs, found_spaces) *)
103103let skip_blanks_check_tabs t =
104104 let found_tab = ref false in
105105+ let found_space = ref false in
105106 while Input.next_is_blank t.input do
106106- if Input.peek t.input = Some '\t' then found_tab := true;
107107+ (match Input.peek t.input with
108108+ | Some '\t' -> found_tab := true
109109+ | Some ' ' -> found_space := true
110110+ | _ -> ());
107111 ignore (Input.next t.input)
108112 done;
109109- !found_tab
113113+ (!found_tab, !found_space)
110114111115(** Skip whitespace and comments, return true if at newline *)
112116let rec skip_to_next_token t =
···140144 Input.consume_break t.input;
141145 (* Allow simple keys after line breaks in flow context *)
142146 t.allow_simple_key <- true;
147147+ (* After line break in flow, check for tabs at start of line (Y79Y/03)
148148+ Tabs are not allowed as indentation - if tab is first char and results
149149+ in a column less than flow_indent, it's an error *)
150150+ if Input.next_is (( = ) '\t') t.input then begin
151151+ (* Tab at start of line in flow context - skip tabs and check position *)
152152+ let start_mark = Input.mark t.input in
153153+ while Input.next_is (( = ) '\t') t.input do
154154+ ignore (Input.next t.input)
155155+ done;
156156+ (* If only tabs were used (no spaces) and column < flow_indent, error *)
157157+ if not (Input.next_is_break t.input) && not (Input.is_eof t.input) &&
158158+ column t < t.flow_indent then
159159+ Error.raise_at start_mark Invalid_flow_indentation
160160+ end;
143161 skip_to_next_token t
144162 end else begin
145163 ignore (Input.next t.input);
···414432 (* Check for document boundary *)
415433 if Input.at_document_boundary t.input then
416434 Error.raise_at start Unclosed_single_quote;
435435+ (* Check indentation: continuation must be > block indent (QB6E, DK95) *)
436436+ let col = column t in
437437+ let indent = current_indent t in
438438+ if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then
439439+ Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
417440 (* Count empty lines (consecutive line breaks) *)
418441 let empty_lines = ref 0 in
419442 while Input.next_is_break t.input do
···423446 ignore (Input.next t.input)
424447 done;
425448 if Input.at_document_boundary t.input then
426426- Error.raise_at start Unclosed_single_quote
449449+ Error.raise_at start Unclosed_single_quote;
450450+ (* Check indentation after each empty line too *)
451451+ let col = column t in
452452+ let indent = current_indent t in
453453+ if not (Input.is_eof t.input) && not (Input.next_is_break t.input) && col <= indent && indent >= 0 then
454454+ Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar")
427455 done;
428456 (* Apply folding rules *)
429457 if !empty_lines > 0 then begin
···552580 (* Count consecutive line breaks (empty lines) *)
553581 let empty_lines = ref 0 in
554582 let continue = ref true in
583583+ let started_with_tab = ref false in
555584 while !continue do
585585+ (* Track if we start with a tab (for DK95/01 check) *)
586586+ if Input.next_is (( = ) '\t') t.input then started_with_tab := true;
556587 (* Skip blanks (spaces/tabs) on the line *)
557588 while Input.next_is_blank t.input do
558589 ignore (Input.next t.input)
···560591 (* Check if we hit another line break (empty line) *)
561592 if Input.next_is_break t.input then begin
562593 Input.consume_break t.input;
563563- incr empty_lines
594594+ incr empty_lines;
595595+ started_with_tab := false (* Reset for next line *)
564596 end else
565597 continue := false
566598 done;
567599 (* Check for document boundary - this terminates the quoted string *)
568600 if Input.at_document_boundary t.input then
569601 Error.raise_at start Unclosed_double_quote;
602602+ (* Check indentation: continuation must be > block indent (QB6E, DK95)
603603+ Note: must be strictly greater than block indent, not just equal *)
604604+ let col = column t in
605605+ let indent = current_indent t in
606606+ let start_col = start.column in
607607+ (* DK95/01: if continuation started with tabs and column < start column, error *)
608608+ if not (Input.is_eof t.input) && !started_with_tab && col < start_col then
609609+ Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
610610+ if not (Input.is_eof t.input) && col <= indent && indent >= 0 then
611611+ Error.raise_at (Input.mark t.input) (Invalid_quoted_scalar_indentation "invalid indentation in quoted scalar");
570612 (* Per YAML spec: single break = space, break + empty lines = newlines *)
571613 if !empty_lines > 0 then begin
572614 (* Empty lines: output N newlines where N = number of empty lines *)
···775817 let buf = Buffer.create 256 in
776818 let trailing_breaks = Buffer.create 16 in
777819 let leading_blank = ref false in (* Was the previous line "more indented"? *)
820820+ let max_empty_line_indent = ref 0 in (* Track max indent of empty lines before first content *)
778821779822 (* Skip to content indentation, skipping empty lines.
780823 Returns the number of spaces actually skipped (important for detecting dedentation). *)
···843886 match Input.peek_nth t.input (!idx) with
844887 | None | Some '\n' | Some '\r' ->
845888 (* Line has only spaces - empty line *)
889889+ (* Track max indent of empty lines for later validation *)
890890+ if !idx > !max_empty_line_indent then
891891+ max_empty_line_indent := !idx;
846892 while Input.next_is (( = ) ' ') t.input do
847893 ignore (Input.next t.input)
848894 done;
···852898 | _ ->
853899 (* Has content (including tabs which are content, not indentation) *)
854900 0
901901+ end else if Input.next_is (( = ) '\t') t.input then begin
902902+ (* Tab at start of line in implicit indent mode - this is an error (Y79Y)
903903+ because tabs cannot be used as indentation in YAML *)
904904+ Error.raise_at (Input.mark t.input) Tab_in_indentation
855905 end else
856856- (* Not at break or space - could be tab (content) or other *)
906906+ (* Not at break or space - other content character *)
857907 0
858908 end
859909 in
···887937 if line_indent <= base_level then
888938 false (* No content - first line not indented enough *)
889939 else begin
940940+ (* Validate: first content line must be indented at least as much as
941941+ the maximum indent seen on empty lines before it (5LLU, S98Z, W9L4) *)
942942+ if line_indent < !max_empty_line_indent && line_indent > base_level then
943943+ Error.raise_at (Input.mark t.input)
944944+ (Invalid_block_scalar_header "wrongly indented line in block scalar");
890945 content_indent := line_indent;
891946 true
892947 end
···10111066 while Input.next_is_digit t.input do
10121067 minor := !minor * 10 + (Char.code (Input.next_exn t.input) - Char.code '0')
10131068 done;
10691069+ (* Validate: only whitespace and comments allowed before line break (MUS6) *)
10701070+ skip_whitespace_and_comment t;
10711071+ if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then
10721072+ Error.raise_at (Input.mark t.input) (Invalid_directive "expected comment or line break after version");
10141073 let span = Span.make ~start ~stop:(Input.mark t.input) in
10151074 Token.Version_directive { major = !major; minor = !minor }, span
10161075···12071266 ignore (Input.next t.input);
1208126712091268 (* Check for tabs after - : pattern like -\t- is invalid *)
12101210- let found_tabs = skip_blanks_check_tabs t in
12691269+ let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in
12111270 if found_tabs then begin
12121271 (* If we found tabs and next char is - followed by whitespace, error *)
12131272 match Input.peek t.input with
···12481307 ignore (Input.next t.input);
1249130812501309 (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *)
12511251- let found_tabs = skip_blanks_check_tabs t in
13101310+ let (found_tabs, _found_spaces) = skip_blanks_check_tabs t in
12521311 if found_tabs && t.flow_level = 0 then begin
12531312 (* In block context, tabs after ? are not allowed *)
12541313 Error.raise_at start Tab_in_indentation
···13441403 let start = Input.mark t.input in
13451404 ignore (Input.next t.input);
1346140513471347- (* Check for tabs after : : pattern like :\t- is invalid in block context *)
13481348- let found_tabs = skip_blanks_check_tabs t in
13491349- if found_tabs && t.flow_level = 0 then begin
13501350- (* In block context, tabs after : followed by indicator are not allowed *)
14061406+ (* Check for tabs after : : patterns like :\t- or :\tkey: are invalid in block context (Y79Y/09)
14071407+ However, :\t bar (tab followed by space then content) is valid (6BCT) *)
14081408+ let (found_tabs, found_spaces) = skip_blanks_check_tabs t in
14091409+ if found_tabs && not found_spaces && t.flow_level = 0 then begin
14101410+ (* In block context, tabs-only after : followed by indicator or alphanumeric are not allowed *)
13511411 match Input.peek t.input with
13521412 | Some ('-' | '?') ->
14131413+ Error.raise_at start Tab_in_indentation
14141414+ | Some c when (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ->
14151415+ (* Tab-only followed by alphanumeric - likely a key, which is invalid *)
13531416 Error.raise_at start Tab_in_indentation
13541417 | _ -> ()
13551418 end;