···127127 Error.raise_span tok.span (Invalid_yaml_version "duplicate YAML directive");
128128 t.version <- Some (major, minor)
129129 | Token.Tag_directive { handle; prefix } ->
130130- if List.mem_assoc handle t.tag_directives &&
131131- handle <> "!" && handle <> "!!" then
132132- Error.raise_span tok.span (Invalid_tag_directive ("duplicate tag handle: " ^ handle));
133133- t.tag_directives <- (handle, prefix) :: t.tag_directives
130130+ (* Skip empty tag directives (these are reserved/unknown directives that were ignored) *)
131131+ if handle = "" && prefix = "" then
132132+ () (* Ignore reserved directives *)
133133+ else begin
134134+ if List.mem_assoc handle t.tag_directives &&
135135+ handle <> "!" && handle <> "!!" then
136136+ Error.raise_span tok.span (Invalid_tag_directive ("duplicate tag handle: " ^ handle));
137137+ t.tag_directives <- (handle, prefix) :: t.tag_directives
138138+ end
134139 | _ -> ()
135140 done
136141···458463 let tok = current_token t in
459464 match tok.token with
460465 | Token.Flow_sequence_end ->
461461- t.state <- Flow_sequence_entry;
462462- empty_scalar_event ~anchor:None ~tag:None tok.span
466466+ (* Trailing comma case - don't emit empty scalar, just go back to sequence entry state *)
467467+ skip_token t;
468468+ t.state <- pop_state t;
469469+ Event.Sequence_end, tok.span
463470 | Token.Flow_entry ->
464471 (* Double comma or comma after comma - invalid *)
465472 Error.raise_span tok.span (Unexpected_token "unexpected ',' in flow sequence")
466473 | Token.Key ->
467474 skip_token t;
468468- push_state t Flow_sequence_entry_mapping_end;
475475+ t.state <- Flow_sequence_entry_mapping_key;
476476+ Event.Mapping_start {
477477+ anchor = None; tag = None;
478478+ implicit = true;
479479+ style = Layout_style.Flow;
480480+ }, tok.span
481481+ | Token.Value ->
482482+ (* Implicit empty key mapping: [ : value ] *)
469483 t.state <- Flow_sequence_entry_mapping_key;
470484 Event.Mapping_start {
471485 anchor = None; tag = None;
···587601 parse_stream_start t
588602589603 | Implicit_document_start ->
604604+ (* Skip any document end markers before checking what's next *)
605605+ while check t (function Token.Document_end -> true | _ -> false) do
606606+ skip_token t
607607+ done;
608608+590609 let tok = current_token t in
591610 (match tok.token with
592611 | Token.Stream_end ->
+184-60
yaml/ocaml-yamle/lib/scanner.ml
···2929 mutable allow_simple_key : bool;
3030 mutable leading_whitespace : bool; (** True when at start of line (only whitespace seen) *)
3131 mutable document_has_content : bool; (** True if we've emitted content tokens in current document *)
3232+ mutable adjacent_value_allowed_at : Position.t option; (** Position where adjacent : is allowed *)
3333+ mutable pending_value : bool; (** True if we've emitted a KEY and are waiting for VALUE *)
3234}
33353436let create input =
···4648 allow_simple_key = true;
4749 leading_whitespace = true; (* Start at beginning of stream *)
4850 document_has_content = false;
5151+ adjacent_value_allowed_at = None;
5252+ pending_value = false;
4953 }
50545155let of_string s = create (Input.of_string s)
···6367(** Get current indent level *)
let current_indent t =
  (* The innermost open indentation level is the head of [t.indent_stack].
     An empty stack yields the sentinel [-1] (presumably so that column 0
     still compares as deeper than "no indent" - confirm against callers). *)
  match t.indent_stack with
  | { indent; _ } :: _ -> indent
  | [] -> -1
68726973(** Skip whitespace to end of line, checking for valid comments.
···7983 if Input.next_is (( = ) '#') t.input then begin
8084 (* Validate: comment must be preceded by whitespace or be at start of line *)
8185 if not !has_whitespace then begin
8282- (* Check if we're at the start of input or after a line break *)
8686+ (* Check if we're at the start of input or after whitespace (blank or line break) *)
8387 match Input.peek_back t.input with
8488 | None -> () (* Start of input - OK *)
8585- | Some c when Input.is_break c -> () (* After line break - OK *)
8989+ | Some c when Input.is_whitespace c -> () (* After whitespace - OK *)
8690 | _ ->
8791 (* Comment not preceded by whitespace - ERROR *)
8892 Error.raise_at (Input.mark t.input) Invalid_comment
···107111 (* Check for tabs used as indentation in block context *)
108112 (match Input.peek t.input with
109113 | Some '\t' when t.flow_level = 0 && t.leading_whitespace &&
110110- (column t - 1) <= current_indent t ->
114114+ (column t - 1) < current_indent t ->
111115 (* Tab found in indentation zone - this is invalid *)
112116 (* Skip to end of line to check if line has content *)
113117 let start_pos = Input.mark t.input in
···129133 skip_to_next_token t
130134 end
131135 else if t.flow_level > 0 && Input.next_is_whitespace t.input then begin
132132- ignore (Input.next t.input);
133133- skip_to_next_token t
136136+ (* In flow context, skip all whitespace including line breaks *)
137137+ if Input.next_is_break t.input then begin
138138+ Input.consume_break t.input;
139139+ (* Allow simple keys after line breaks in flow context *)
140140+ t.allow_simple_key <- true;
141141+ skip_to_next_token t
142142+ end else begin
143143+ ignore (Input.next t.input);
144144+ skip_to_next_token t
145145+ end
134146 end
135147136148(** Roll the indentation level *)
···253265254266(** Scan tag suffix (after handle) *)
255267let scan_tag_suffix t =
268268+ let is_hex_digit c =
269269+ (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')
270270+ in
271271+ let hex_val c =
272272+ match c with
273273+ | '0'..'9' -> Char.code c - Char.code '0'
274274+ | 'A'..'F' -> Char.code c - Char.code 'A' + 10
275275+ | 'a'..'f' -> Char.code c - Char.code 'a' + 10
276276+ | _ -> 0
277277+ in
256278 let buf = Buffer.create 32 in
257279 while
258280 match Input.peek t.input with
281281+ | Some '%' ->
282282+ (* Percent-encoded character *)
283283+ ignore (Input.next t.input);
284284+ (match Input.peek t.input, Input.peek_nth t.input 1 with
285285+ | Some c1, Some c2 when is_hex_digit c1 && is_hex_digit c2 ->
286286+ ignore (Input.next t.input);
287287+ ignore (Input.next t.input);
288288+ let code = (hex_val c1) * 16 + (hex_val c2) in
289289+ Buffer.add_char buf (Char.chr code);
290290+ true
291291+ | _ ->
292292+ (* Invalid percent encoding - keep the % *)
293293+ Buffer.add_char buf '%';
294294+ true)
259295 | Some c when not (Input.is_whitespace c) &&
260296 not (Input.is_flow_indicator c) ->
261297 Buffer.add_char buf c;
···396432 let start = Input.mark t.input in
397433 ignore (Input.next t.input); (* consume opening double-quote *)
398434 let buf = Buffer.create 64 in
435435+ let whitespace = Buffer.create 16 in (* Track pending whitespace *)
436436+437437+ let flush_whitespace () =
438438+ if Buffer.length whitespace > 0 then begin
439439+ Buffer.add_buffer buf whitespace;
440440+ Buffer.clear whitespace
441441+ end
442442+ in
443443+399444 let rec loop () =
400445 match Input.peek t.input with
401446 | None -> Error.raise_at start Unclosed_double_quote
402447 | Some '"' ->
448448+ (* Flush trailing whitespace before closing quote to preserve it *)
449449+ flush_whitespace ();
403450 ignore (Input.next t.input)
451451+ | Some ' ' | Some '\t' as c_opt ->
452452+ (* Track whitespace - don't add to buf yet *)
453453+ let c = match c_opt with Some c -> c | None -> assert false in
454454+ Buffer.add_char whitespace c;
455455+ ignore (Input.next t.input);
456456+ loop ()
404457 | Some '\\' ->
458458+ (* Escape sequence - this is non-whitespace content *)
459459+ flush_whitespace (); (* Commit any pending whitespace *)
405460 ignore (Input.next t.input);
406461 (match Input.peek t.input with
407462 | None -> Error.raise_at start (Invalid_escape_sequence "\\<EOF>")
···432487 ignore (Input.next t.input);
433488 Buffer.add_string buf (decode_hex t 8)
434489 | Some '\n' | Some '\r' ->
435435- (* Line continuation *)
490490+ (* Line continuation escape *)
436491 Input.consume_break t.input;
437492 while Input.next_is_blank t.input do
438493 ignore (Input.next t.input)
···442497 (Invalid_escape_sequence (Printf.sprintf "\\%c" c)));
443498 loop ()
444499 | Some '\n' | Some '\r' ->
445445- (* Per YAML spec: discard trailing whitespace before line break *)
446446- let len = Buffer.length buf in
447447- let rec trim_end i =
448448- if i < 0 then 0
449449- else match Buffer.nth buf i with
450450- | ' ' | '\t' -> trim_end (i - 1)
451451- | _ -> i + 1
452452- in
453453- Buffer.truncate buf (trim_end (len - 1));
500500+ (* Line break: discard any pending trailing whitespace *)
501501+ Buffer.clear whitespace;
454502 Input.consume_break t.input;
455503 (* Count consecutive line breaks (empty lines) *)
456504 let empty_lines = ref 0 in
···481529 Buffer.add_char buf ' ';
482530 loop ()
483531 | Some c ->
532532+ (* Non-whitespace character *)
533533+ flush_whitespace (); (* Commit any pending whitespace *)
484534 Buffer.add_char buf c;
485535 ignore (Input.next t.input);
486536 loop ()
···516566 let start = Input.mark t.input in
517567 let in_flow = t.flow_level > 0 in
518568 let indent = current_indent t in
519519- (* Validate flow collection indentation *)
520520- if in_flow && (column t) < t.flow_indent then
569569+ (* In flow context, scalars must be indented more than the current block indent.
570570+ This ensures that content at block indent or less ends the flow context. *)
571571+ if in_flow && (column t - 1) < indent then
521572 Error.raise_at start Invalid_flow_indentation;
522573 let buf = Buffer.create 64 in
523574 let spaces = Buffer.create 16 in
575575+ let whitespace = Buffer.create 16 in (* Track whitespace within a line *)
524576 let leading_blanks = ref false in
525577526578 let rec scan_line () =
527579 match Input.peek t.input with
528580 | None -> ()
581581+ | Some c when Input.is_blank c && can_continue_plain t c ~in_flow ->
582582+ (* Blank character within a line - save to whitespace buffer *)
583583+ Buffer.add_char whitespace c;
584584+ ignore (Input.next t.input);
585585+ scan_line ()
529586 | Some c when can_continue_plain t c ~in_flow ->
530530- (* can_continue_plain already handles # correctly - it returns false
531531- when # is preceded by whitespace (making it a comment indicator) *)
587587+ (* Non-blank character - process any pending breaks/whitespace first *)
532588 begin
533589 if Buffer.length spaces > 0 then begin
534590 if !leading_blanks then begin
···544600 Buffer.add_buffer buf spaces;
545601 Buffer.clear spaces
546602 end;
603603+ (* Add any pending whitespace from within the line *)
604604+ if Buffer.length whitespace > 0 then begin
605605+ Buffer.add_buffer buf whitespace;
606606+ Buffer.clear whitespace
607607+ end;
608608+ (* Add the character *)
547609 Buffer.add_char buf c;
548610 ignore (Input.next t.input);
549611 leading_blanks := false;
···555617 let rec scan_lines () =
556618 scan_line ();
557619 (* Check for line continuation *)
558558- if not in_flow && Input.next_is_break t.input then begin
620620+ if Input.next_is_break t.input then begin
621621+ (* Discard any trailing whitespace from the current line *)
622622+ Buffer.clear whitespace;
559623 (* Save the line break *)
560624 if !leading_blanks then begin
561625 (* We already had a break - this is an additional break (empty line) *)
···567631 leading_blanks := true
568632 end;
569633 Input.consume_break t.input;
570570- (* Line break in block context allows simple key *)
571571- t.allow_simple_key <- true;
634634+ (* Line break allows simple key in both block and flow contexts *)
635635+ if in_flow then
636636+ t.allow_simple_key <- true;
637637+ if not in_flow then
638638+ t.allow_simple_key <- true;
572639 (* Skip leading blanks on the next line *)
573640 while Input.next_is_blank t.input do
574641 ignore (Input.next t.input)
575642 done;
576643 let col = (Input.position t.input).column in
577644 (* Check indentation - stop if we're at or before the containing block's indent *)
578578- if not in_flow && col <= indent then
579579- () (* Stop - dedented or at parent level *)
645645+ (* However, allow empty lines (line breaks) to continue even if dedented *)
646646+ if Input.next_is_break t.input then
647647+ scan_lines () (* Empty line - continue *)
648648+ else if not in_flow && col <= indent then
649649+ () (* Stop - dedented or at parent level in block context *)
580650 else if Input.at_document_boundary t.input then
581651 () (* Stop - document boundary *)
582652 else
···652722653723 let buf = Buffer.create 256 in
654724 let trailing_breaks = Buffer.create 16 in
725725+ let leading_blank = ref false in (* Was the previous line "more indented"? *)
655726656727 (* Skip to content indentation, skipping empty lines.
657728 Returns the number of spaces actually skipped (important for detecting dedentation). *)
···686757 (* Check if we found a break (empty line) or content *)
687758 (match Input.peek_nth t.input (!idx) with
688759 | None | Some '\n' | Some '\r' ->
689689- (* Empty line - consume all blanks and break *)
690690- while Input.next_is_blank t.input do
691691- ignore (Input.next t.input)
692692- done;
760760+ (* Empty line - preserve spaces for literal scalars *)
761761+ if literal then begin
762762+ while Input.next_is_blank t.input do
763763+ Buffer.add_char trailing_breaks ' ';
764764+ ignore (Input.next t.input)
765765+ done
766766+ end else begin
767767+ while Input.next_is_blank t.input do
768768+ ignore (Input.next t.input)
769769+ done
770770+ end;
693771 Buffer.add_char trailing_breaks '\n';
694772 Input.consume_break t.input;
695773 skip_to_content_indent ()
···748826 let line_indent = spaces_skipped + !extra_spaces in
749827750828 (* Determine content indent from first content line (implicit case) *)
829829+ let first_line = !content_indent = 0 in
751830 if !content_indent = 0 then begin
752831 if line_indent <= base_indent then begin
753832 (* No content - restore position conceptually *)
···760839 (* Dedented - done with content *)
761840 ()
762841 end else begin
842842+ (* Check if current line is "more indented" (has extra indent beyond content_indent) *)
843843+ let trailing_blank = line_indent > !content_indent in
844844+763845 (* Add trailing breaks to buffer *)
764846 if Buffer.length buf > 0 then begin
765847 if Buffer.length trailing_breaks > 0 then begin
766848 if literal then
767849 Buffer.add_buffer buf trailing_breaks
768850 else begin
769769- let breaks = Buffer.contents trailing_breaks in
770770- if String.length breaks = 1 then
771771- Buffer.add_char buf ' '
772772- else
773773- Buffer.add_substring buf breaks 1 (String.length breaks - 1)
851851+ (* Folded scalar: fold only if both previous and current lines are not more-indented *)
852852+ if not !leading_blank && not trailing_blank then begin
853853+ let breaks = Buffer.contents trailing_breaks in
854854+ if String.length breaks = 1 then
855855+ Buffer.add_char buf ' '
856856+ else
857857+ Buffer.add_substring buf breaks 1 (String.length breaks - 1)
858858+ end else begin
859859+ (* Preserve breaks for more-indented lines *)
860860+ Buffer.add_buffer buf trailing_breaks
861861+ end
774862 end
775863 end else if not literal then
776864 Buffer.add_char buf ' '
···778866 Buffer.add_buffer buf trailing_breaks;
779867 Buffer.clear trailing_breaks;
780868781781- (* Add extra indentation for literal *)
782782- if literal then begin
783783- for _ = !content_indent + 1 to line_indent do
869869+ (* Add extra indentation for literal or more-indented folded lines *)
870870+ (* On the first line (when determining content_indent), we've already consumed all spaces,
871871+ so we should NOT add any back. On subsequent lines, we add only the spaces beyond content_indent. *)
872872+ if not first_line && (literal || (!extra_spaces > 0 && not literal)) then begin
873873+ for _ = 1 to !extra_spaces do
784874 Buffer.add_char buf ' '
785875 done
786876 end;
···795885 Buffer.add_char trailing_breaks '\n';
796886 Input.consume_break t.input
797887 end;
888888+889889+ (* Update leading_blank for next iteration *)
890890+ leading_blank := trailing_blank;
798891799892 read_lines ()
800893 end
···840933 ignore (Input.next t.input)
841934 done;
842935843843- let span = Span.make ~start ~stop:(Input.mark t.input) in
844844-845936 match name with
846937 | "YAML" ->
847938 (* Version directive: %YAML 1.2 *)
···883974 let span = Span.make ~start ~stop:(Input.mark t.input) in
884975 Token.Tag_directive { handle; prefix }, span
885976886886- | _ when String.length name > 0 && name.[0] >= 'A' && name.[0] <= 'Z' ->
887887- (* Reserved directive *)
888888- Error.raise_span span (Reserved_directive name)
889889-890977 | _ ->
891891- (* Unknown directive - skip to end of line *)
978978+ (* Reserved/Unknown directive - skip to end of line and ignore *)
979979+ (* Per YAML spec, reserved directives should be ignored with a warning *)
892980 while not (Input.is_eof t.input) && not (Input.next_is_break t.input) do
893981 ignore (Input.next t.input)
894982 done;
895895- Error.raise_span span (Invalid_directive name)
983983+ let span = Span.make ~start ~stop:(Input.mark t.input) in
984984+ (* Return an empty tag directive token to indicate directive was processed but ignored *)
985985+ Token.Tag_directive { handle = ""; prefix = "" }, span
896986897987(** Fetch the next token(s) into the queue *)
898988let rec fetch_next_token t =
···9221012 | Some ',' -> fetch_flow_entry t
9231013 | Some '-' when t.flow_level = 0 && check_block_entry t ->
9241014 fetch_block_entry t
925925- | Some '?' when t.flow_level = 0 && check_key t ->
10151015+ | Some '?' when check_key t ->
9261016 fetch_key t
9271017 | Some ':' when check_value t ->
9281018 fetch_value t
···10041094 t.allow_simple_key <- false;
10051095 let start = Input.mark t.input in
10061096 ignore (Input.next t.input);
10971097+ (* Allow adjacent values after flow collection ends *)
10981098+ if t.flow_level > 0 then
10991099+ t.adjacent_value_allowed_at <- Some (Input.position t.input);
10071100 let span = Span.make ~start ~stop:(Input.mark t.input) in
10081101 emit t span token_type
10091102···1023111610241117and fetch_block_entry t =
10251118 if t.flow_level = 0 then begin
10261026- if not t.allow_simple_key then
10271027- Error.raise_at (Input.mark t.input) Expected_block_entry;
11191119+ (* Block entries don't require allow_simple_key to be true, because:
11201120+ 1. They're not simple keys themselves
11211121+ 2. They can appear after : on the same line (e.g., ": - a")
11221122+ So we only check allow_simple_key in contexts where it's truly required.
11231123+ For now, we allow block entries in block context. *)
10281124 let col = column t in
10291125 if roll_indent t col ~sequence:true then begin
10301126 let span = Span.point (Input.mark t.input) in
···10551151 emit t span Token.Block_entry
and check_key t =
  (* Decide whether the '?' at the cursor acts as a key indicator by
     inspecting the character right after it: end of input or whitespace
     always qualifies; a flow indicator qualifies only while
     [t.flow_level > 0]. *)
  match Input.peek_nth t.input 1 with
  | None -> true
  | Some next ->
    if Input.is_whitespace next then true
    else t.flow_level > 0 && Input.is_flow_indicator next
1063116010641161and fetch_key t =
10651162 if t.flow_level = 0 then begin
···10851182 end;
1086118310871184 let span = Span.make ~start ~stop:(Input.mark t.input) in
10881088- emit t span Token.Key
11851185+ emit t span Token.Key;
11861186+ t.pending_value <- true (* We've emitted a KEY, now waiting for VALUE *)
and check_value t =
  (* Decide whether the ':' at the cursor acts as a value indicator.
     End of input or a following whitespace character always qualifies.
     While [t.flow_level > 0] it also qualifies when followed by a flow
     indicator, or when the cursor sits exactly at the position recorded in
     [t.adjacent_value_allowed_at] (same line and same column). *)
  match Input.peek_nth t.input 1 with
  | None -> true
  | Some next ->
    if Input.is_whitespace next then true
    else if t.flow_level <= 0 then false
    else
      Input.is_flow_indicator next ||
      (match t.adjacent_value_allowed_at with
       | None -> false
       | Some allowed ->
         let here = Input.position t.input in
         allowed.Position.line = here.Position.line &&
         allowed.Position.column = here.Position.column)
1096120110971202and fetch_value t =
10981203 (* Check for simple key *)
···11121217 if insert_pos >= Array.length tokens then
11131218 Queue.add key_token t.tokens;
11141219 t.token_number <- t.token_number + 1;
12201220+ t.pending_value <- true; (* We've inserted a KEY token, now waiting for VALUE *)
11151221 (* Roll indent for implicit block mapping *)
11161222 if t.flow_level = 0 then begin
11171223 let col = sk.sk_position.column in
···11321238 end;
11331239 t.simple_keys <- None :: (List.tl t.simple_keys)
11341240 | _ ->
11351135- (* No simple key - this is a complex value *)
12411241+ (* No simple key - this is a complex value (or empty key) *)
11361242 if t.flow_level = 0 then begin
11371243 if not t.allow_simple_key then
11381244 Error.raise_at (Input.mark t.input) Expected_key;
···11401246 if roll_indent t col ~sequence:false then begin
11411247 let span = Span.point (Input.mark t.input) in
11421248 emit t span Token.Block_mapping_start
12491249+ end;
12501250+ (* Emit KEY token for empty key case (e.g., ": value") only if we don't already have a pending KEY *)
12511251+ if not t.pending_value then begin
12521252+ let span = Span.point (Input.mark t.input) in
12531253+ emit t span Token.Key;
12541254+ t.pending_value <- true
11431255 end
11441256 end);
11451257 remove_simple_key t;
···11611273 | _ -> ()
11621274 end;
1163127512761276+ (* Skip any comment that may follow the colon and whitespace *)
12771277+ skip_whitespace_and_comment t;
12781278+11641279 let span = Span.make ~start ~stop:(Input.mark t.input) in
11651165- emit t span Token.Value
12801280+ emit t span Token.Value;
12811281+ t.pending_value <- false (* We've emitted a VALUE, no longer pending *)
1166128211671283and fetch_alias t =
11681284 save_simple_key t;
···12031319 t.allow_simple_key <- false;
12041320 t.document_has_content <- true;
12051321 let value, span = scan_single_quoted t in
13221322+ (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
13231323+ skip_to_next_token t;
13241324+ if t.flow_level > 0 then
13251325+ t.adjacent_value_allowed_at <- Some (Input.position t.input);
12061326 emit t span (Token.Scalar { style = Scalar_style.Single_quoted; value })
1207132712081328and fetch_double_quoted t =
···12101330 t.allow_simple_key <- false;
12111331 t.document_has_content <- true;
12121332 let value, span = scan_double_quoted t in
13331333+ (* Allow adjacent values after quoted scalars in flow context (for JSON compatibility) *)
13341334+ skip_to_next_token t;
13351335+ if t.flow_level > 0 then
13361336+ t.adjacent_value_allowed_at <- Some (Input.position t.input);
12131337 emit t span (Token.Scalar { style = Scalar_style.Double_quoted; value })
1214133812151339and can_start_plain t =