···1111 | Invalid_tag of string
1212 | Invalid_anchor of string
1313 | Invalid_alias of string
1414+ | Invalid_comment
1415 | Unclosed_single_quote
1516 | Unclosed_double_quote
1617 | Unclosed_flow_sequence
1718 | Unclosed_flow_mapping
1819 | Invalid_indentation of int * int (** expected, got *)
2020+ | Invalid_flow_indentation (** Content in flow collection must be indented *)
1921 | Tab_in_indentation
2022 | Invalid_block_scalar_header of string
2123 | Invalid_directive of string
···106108 | Invalid_tag s -> Printf.sprintf "invalid tag: %s" s
107109 | Invalid_anchor s -> Printf.sprintf "invalid anchor: %s" s
108110 | Invalid_alias s -> Printf.sprintf "invalid alias: %s" s
111111+ | Invalid_comment -> "comments must be separated from other tokens by whitespace"
109112 | Unclosed_single_quote -> "unclosed single quote"
110113 | Unclosed_double_quote -> "unclosed double quote"
111114 | Unclosed_flow_sequence -> "unclosed flow sequence '['"
112115 | Unclosed_flow_mapping -> "unclosed flow mapping '{'"
113116 | Invalid_indentation (expected, got) ->
114117 Printf.sprintf "invalid indentation: expected %d, got %d" expected got
118118+ | Invalid_flow_indentation -> "invalid indentation in flow construct"
115119 | Tab_in_indentation -> "tab character in indentation"
116120 | Invalid_block_scalar_header s ->
117121 Printf.sprintf "invalid block scalar header: %s" s
+5
yaml/ocaml-yamle/lib/input.ml
(** [mark t] is the position [t] is currently at; callers keep it to build
    a span later. *)
let mark { position; _ } = position
(** [peek_back t] is the character of [t.source] located just before the
    current offset [t.pos], or [None] when the offset is at (or before) the
    start of the source. Nothing is consumed. *)
let peek_back t =
  match t.pos with
  | offset when offset > 0 -> Some t.source.[offset - 1]
  | _ -> None
···2424 mutable stream_ended : bool;
2525 mutable indent_stack : indent list; (** Stack of indentation levels *)
2626 mutable flow_level : int; (** Nesting depth in [] or {} *)
2727+ mutable flow_indent : int; (** Column where outermost flow collection started *)
2728 mutable simple_keys : simple_key option list; (** Per flow-level simple key tracking *)
2829 mutable allow_simple_key : bool;
3030+ mutable leading_whitespace : bool; (** True when at start of line (only whitespace seen) *)
3131+ mutable document_has_content : bool; (** True if we've emitted content tokens in current document *)
2932}
30333134let create input =
···3841 stream_ended = false;
3942 indent_stack = [];
4043 flow_level = 0;
4444+ flow_indent = 0;
4145 simple_keys = [None]; (* One entry for the base level *)
4246 allow_simple_key = true;
4747+ leading_whitespace = true; (* Start at beginning of stream *)
4848+ document_has_content = false;
4349 }
44504551let of_string s = create (Input.of_string s)
···6066 | [] -> 0
6167 | { indent; _ } :: _ -> indent
62686363-(** Skip whitespace and comments, return true if at newline *)
6464-let rec skip_to_next_token t =
6565- (* Skip blanks *)
(** Skip blanks (spaces and tabs) and, if one follows, a trailing comment.

    Blanks are consumed first. If the next character is ['#'], the comment is
    validated and then consumed up to, but not including, the line break (or
    the end of input). The function returns [unit].

    A comment is accepted when its ['#'] is
    - at the very start of the input,
    - directly after a line break, or
    - preceded by a blank.

    The separating blank does not have to be consumed by this call: the
    block-entry, key and value fetchers already skip the blanks that follow
    their indicator, so for input such as ["key: # comment"] the space is
    behind the read position when this function runs. The character before
    the ['#'] is therefore inspected as well, instead of relying only on the
    blanks skipped here.

    Raises [Invalid_comment] through [Error.raise_at] when the ['#'] directly
    follows any other character. *)
let skip_whitespace_and_comment t =
  let skipped_blank = ref false in
  (* Consume the run of spaces/tabs in front of a possible comment. *)
  while Input.next_is_blank t.input do
    skipped_blank := true;
    ignore (Input.next t.input)
  done;
  if Input.next_is (( = ) '#') t.input then begin
    (* Only look backwards when this call did not itself skip a blank. *)
    if not !skipped_blank then begin
      match Input.peek_back t.input with
      | None -> () (* start of input *)
      | Some (' ' | '\t') -> () (* blank consumed earlier by the caller *)
      | Some c when Input.is_break c -> () (* start of a line *)
      | Some _ ->
        (* '#' glued to the previous token is not a comment *)
        Error.raise_at (Input.mark t.input) Invalid_comment
    end;
    (* Drop the comment text; the line break is left for the caller. *)
    while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
      ignore (Input.next t.input)
    done
  end
(** Consume the run of blanks (spaces/tabs) at the current position.
    Returns [true] when at least one of the consumed characters was a tab,
    [false] otherwise (including when there was nothing to skip). *)
let skip_blanks_check_tabs t =
  let rec consume saw_tab =
    if Input.next_is_blank t.input then begin
      (* Inspect the blank before consuming it. *)
      let is_tab = Input.peek t.input = Some '\t' in
      ignore (Input.next t.input);
      consume (saw_tab || is_tab)
    end
    else saw_tab
  in
  consume false
104104+105105+(** Skip whitespace and comments, return true if at newline *)
106106+let rec skip_to_next_token t =
107107+ (* Check for tabs used as indentation in block context *)
108108+ (match Input.peek t.input with
109109+ | Some '\t' when t.flow_level = 0 && t.leading_whitespace &&
110110+ (column t - 1) <= current_indent t ->
111111+ (* Tab found in indentation zone - this is invalid *)
112112+ (* Skip to end of line to check if line has content *)
113113+ let start_pos = Input.mark t.input in
114114+ while Input.next_is_blank t.input do
115115+ ignore (Input.next t.input)
116116+ done;
117117+ (* If we have content on this line with a tab, raise error *)
118118+ if not (Input.next_is_break t.input) && not (Input.is_eof t.input) then
119119+ Error.raise_at start_pos Tab_in_indentation
120120+ | _ -> ());
121121+122122+ (* Skip blanks and validate comments *)
123123+ skip_whitespace_and_comment t;
75124 (* Skip line break in block context *)
76125 if t.flow_level = 0 && Input.next_is_break t.input then begin
77126 Input.consume_break t.input;
78127 t.allow_simple_key <- true;
128128+ t.leading_whitespace <- true;
79129 skip_to_next_token t
80130 end
81131 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin
···297347 while Input.next_is_blank t.input do
298348 ignore (Input.next t.input)
299349 done;
350350+ (* Check for document boundary - this terminates the quoted string *)
351351+ if Input.at_document_boundary t.input then
352352+ Error.raise_at start Unclosed_single_quote;
300353 loop ()
301354 | Some c ->
302355 Buffer.add_char buf c;
···414467 end else
415468 continue := false
416469 done;
470470+ (* Check for document boundary - this terminates the quoted string *)
471471+ if Input.at_document_boundary t.input then
472472+ Error.raise_at start Unclosed_double_quote;
417473 (* Per YAML spec: single break = space, break + empty lines = newlines *)
418474 if !empty_lines > 0 then begin
419475 (* Empty lines: output N newlines where N = number of empty lines *)
···444500 | Some c2 when in_flow && Input.is_flow_indicator c2 -> false
445501 | _ -> true)
446502 | '#' ->
447447- (* # is OK if not preceded by whitespace (checked at call site) *)
448448- false
503503+ (* # is a comment indicator only if preceded by whitespace *)
504504+ (* Check the previous character to determine if this is a comment *)
505505+ (match Input.peek_back t.input with
506506+ | None -> true (* At start - can't be comment indicator, allow it *)
507507+ | Some c when Input.is_whitespace c -> false (* Preceded by whitespace - comment *)
508508+ | Some c when Input.is_break c -> false (* At start of line - comment *)
509509+ | _ -> true) (* Not preceded by whitespace - part of scalar *)
449510 | c when in_flow && Input.is_flow_indicator c -> false
450511 | _ when Input.is_break c -> false
451512 | _ -> true
···455516 let start = Input.mark t.input in
456517 let in_flow = t.flow_level > 0 in
457518 let indent = current_indent t in
519519+ (* Validate flow collection indentation *)
520520+ if in_flow && (column t) < t.flow_indent then
521521+ Error.raise_at start Invalid_flow_indentation;
458522 let buf = Buffer.create 64 in
459523 let spaces = Buffer.create 16 in
460524 let leading_blanks = ref false in
···463527 match Input.peek t.input with
464528 | None -> ()
465529 | Some c when can_continue_plain t c ~in_flow ->
466466- (* Check for # preceded by space *)
467467- if c = '#' && Buffer.length buf > 0 then
468468- () (* Stop - # after content *)
469469- else begin
530530+ (* can_continue_plain already handles # correctly - it returns false
531531+ when # is preceded by whitespace (making it a comment indicator) *)
532532+ begin
470533 if Buffer.length spaces > 0 then begin
471534 if !leading_blanks then begin
472535 (* Fold line break *)
···567630 chomping := Chomping.Keep; ignore (Input.next t.input)
568631 | _ -> ());
569632570570- (* Skip to end of line *)
571571- while Input.next_is_blank t.input do
572572- ignore (Input.next t.input)
573573- done;
574574-575575- (* Optional comment *)
576576- if Input.next_is (( = ) '#') t.input then begin
577577- while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
578578- ignore (Input.next t.input)
579579- done
580580- end;
633633+ (* Skip whitespace and optional comment *)
634634+ skip_whitespace_and_comment t;
581635582636 (* Consume line break *)
583637 if Input.next_is_break t.input then
···843897 Note: we use col, not col-1, to allow entries at the same level. *)
844898 unroll_indent t col;
845899900900+ (* We're about to process actual content, not leading whitespace *)
901901+ t.leading_whitespace <- false;
902902+846903 if Input.is_eof t.input then
847904 fetch_stream_end t
848905 else if Input.at_document_boundary t.input then
···901958 Input.skip t.input 3;
902959 let span = Span.make ~start ~stop:(Input.mark t.input) in
903960 let token = if indicator = "---" then Token.Document_start else Token.Document_end in
961961+ (* Reset document content flag after document end marker *)
962962+ if indicator = "..." then
963963+ t.document_has_content <- false;
904964 emit t span token
905965906966and fetch_directive t =
967967+ (* Directives can only appear:
968968+ 1. At stream start (before any document content)
969969+ 2. After a document end marker (...)
970970+ If we've emitted content in the current document, we need a document end marker first *)
971971+ if t.document_has_content then
972972+ Error.raise_at (Input.mark t.input)
973973+ (Unexpected_token "directives must be separated from document content by document end marker (...)");
907974 unroll_indent t (-1);
908975 remove_simple_key t;
909976 t.allow_simple_key <- false;
···912979913980and fetch_flow_collection_start t token_type =
914981 save_simple_key t;
982982+ (* Record indent of outermost flow collection *)
983983+ if t.flow_level = 0 then
984984+ t.flow_indent <- column t;
915985 t.flow_level <- t.flow_level + 1;
916986 t.allow_simple_key <- true;
917987 t.simple_keys <- None :: t.simple_keys;
988988+ t.document_has_content <- true;
918989 let start = Input.mark t.input in
919990 ignore (Input.next t.input);
920991 let span = Span.make ~start ~stop:(Input.mark t.input) in
···9561027 end;
9571028 remove_simple_key t;
9581029 t.allow_simple_key <- true;
10301030+ t.document_has_content <- true;
9591031 let start = Input.mark t.input in
9601032 ignore (Input.next t.input);
10331033+10341034+ (* Check for tabs after - : pattern like -\t- is invalid *)
10351035+ let found_tabs = skip_blanks_check_tabs t in
10361036+ if found_tabs then begin
10371037+ (* If we found tabs and next char is - followed by whitespace, error *)
10381038+ match Input.peek t.input with
10391039+ | Some '-' ->
10401040+ (match Input.peek_nth t.input 1 with
10411041+ | None -> Error.raise_at start Tab_in_indentation
10421042+ | Some c when Input.is_whitespace c ->
10431043+ Error.raise_at start Tab_in_indentation
10441044+ | Some _ -> ())
10451045+ | _ -> ()
10461046+ end;
10471047+9611048 let span = Span.make ~start ~stop:(Input.mark t.input) in
9621049 emit t span Token.Block_entry
9631050···9801067 end;
9811068 remove_simple_key t;
9821069 t.allow_simple_key <- t.flow_level = 0;
10701070+ t.document_has_content <- true;
9831071 let start = Input.mark t.input in
9841072 ignore (Input.next t.input);
10731073+10741074+ (* Check for tabs after ? : pattern like ?\t- or ?\tkey is invalid *)
10751075+ let found_tabs = skip_blanks_check_tabs t in
10761076+ if found_tabs && t.flow_level = 0 then begin
10771077+ (* In block context, tabs after ? are not allowed *)
10781078+ Error.raise_at start Tab_in_indentation
10791079+ end;
10801080+9851081 let span = Span.make ~start ~stop:(Input.mark t.input) in
9861082 emit t span Token.Key
9871083···10411137 end
10421138 end);
10431139 remove_simple_key t;
10441044- t.allow_simple_key <- t.flow_level = 0;
11401140+ (* In block context, allow_simple_key becomes true only after a line break,
11411141+ not immediately after ':'. This prevents constructs like "key: - a".
11421142+ The line break handling in skip_to_next_token will set it to true. *)
11431143+ t.allow_simple_key <- false;
11441144+ t.document_has_content <- true;
10451145 let start = Input.mark t.input in
10461146 ignore (Input.next t.input);
11471147+11481148+ (* Check for tabs after : : pattern like :\t- is invalid in block context *)
11491149+ let found_tabs = skip_blanks_check_tabs t in
11501150+ if found_tabs && t.flow_level = 0 then begin
11511151+ (* In block context, tabs after : followed by indicator are not allowed *)
11521152+ match Input.peek t.input with
11531153+ | Some ('-' | '?') ->
11541154+ Error.raise_at start Tab_in_indentation
11551155+ | _ -> ()
11561156+ end;
11571157+10471158 let span = Span.make ~start ~stop:(Input.mark t.input) in
10481159 emit t span Token.Value
1049116010501161and fetch_alias t =
10511162 save_simple_key t;
10521163 t.allow_simple_key <- false;
11641164+ t.document_has_content <- true;
10531165 let start = Input.mark t.input in
10541166 ignore (Input.next t.input); (* consume * *)
10551167 let name, span = scan_anchor_alias t in
···10591171and fetch_anchor t =
10601172 save_simple_key t;
10611173 t.allow_simple_key <- false;
11741174+ t.document_has_content <- true;
10621175 let start = Input.mark t.input in
10631176 ignore (Input.next t.input); (* consume & *)
10641177 let name, span = scan_anchor_alias t in
(** Scan a tag and emit it as a [Token.Tag].
    Beforehand the simple-key candidate is saved, further simple keys are
    disallowed and the document is flagged as having content. *)
and fetch_tag t =
  save_simple_key t;
  t.document_has_content <- true;
  t.allow_simple_key <- false;
  let (handle, suffix, span) = scan_tag t in
  let token = Token.Tag { handle; suffix } in
  emit t span token
(** Scan a block scalar ([literal] is forwarded to [scan_block_scalar]) and
    emit it as a [Token.Scalar] carrying the style reported by the scan.
    Any pending simple key is removed, simple keys are allowed again and the
    document is flagged as having content. *)
and fetch_block_scalar t literal =
  remove_simple_key t;
  t.document_has_content <- true;
  t.allow_simple_key <- true;
  let (value, style, span) = scan_block_scalar t literal in
  let token = Token.Scalar { value; style } in
  emit t span token
(** Scan a single-quoted scalar and emit it as a [Token.Scalar] with style
    [Scalar_style.Single_quoted].
    Beforehand the simple-key candidate is saved, further simple keys are
    disallowed and the document is flagged as having content. *)
and fetch_single_quoted t =
  save_simple_key t;
  t.document_has_content <- true;
  t.allow_simple_key <- false;
  let (value, span) = scan_single_quoted t in
  let style = Scalar_style.Single_quoted in
  emit t span (Token.Scalar { value; style })
(** Scan a double-quoted scalar and emit it as a [Token.Scalar] with style
    [Scalar_style.Double_quoted].
    Beforehand the simple-key candidate is saved, further simple keys are
    disallowed and the document is flagged as having content. *)
and fetch_double_quoted t =
  save_simple_key t;
  t.document_has_content <- true;
  t.allow_simple_key <- false;
  let (value, span) = scan_double_quoted t in
  let style = Scalar_style.Double_quoted in
  emit t span (Token.Scalar { value; style })
(** Scan a plain (unquoted) scalar and emit it as a [Token.Scalar] with
    style [Scalar_style.Plain].
    Beforehand the simple-key candidate is saved, further simple keys are
    disallowed and the document is flagged as having content. *)
and fetch_plain_scalar t =
  save_simple_key t;
  t.document_has_content <- true;
  t.allow_simple_key <- false;
  let (value, span) = scan_plain_scalar t in
  let style = Scalar_style.Plain in
  emit t span (Token.Scalar { value; style })
11111229