Protocol Buffers codec for hand-written schemas
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

protobuf: proto3 default omission + fuzz + hostile + protoc interop

Fixes and extensions building on the scaffolding landed in fd396b81e.

- Encoder: proto3 scalar fields equal to their codec default are now
omitted from the wire. This is the first real interop bug — protoc
output for [Test1 {a = 0}] is empty, but we were emitting "0800".
[Message.encode_fields] checks [v <> codec.default] before writing
required fields.
- Fuzz suite (fuzz/): 17 alcobar invariants covering scalar round-trip,
the kitchen-sink message (every scalar plus optional/repeated/packed/
nested), and decoder robustness against arbitrary bytes (must return
Ok or Error; never raise, loop, or allocate unboundedly).
- Hostile-input tests (test_hostile): eleven regressions for known
protobuf decoder CVE classes — huge length prefix DoS, over-long
varint, truncated tag, reserved tag 0, unsupported wire type, wire
type mismatch, empty input -> defaults, overrun rejected, length past
end, packed corrupt body, and many-repeated scaling. Depth-limited
recursion noted as a TODO (needs a Lazy-wrapped recursive codec and
an explicit depth bound in the decoder).
- Interop test against protoc (test/interop/protoc/): Python oracle
using grpcio-tools 1.73.0 + protobuf 6.31.0, generating two trace
CSVs for a Test1 message and an Everything message covering all 15
scalar types plus optional/repeated/packed/nested. The OCaml test
asserts byte-for-byte equality in both directions (encode matches
protoc, decode reproduces protoc's values). [dune build @regen-traces]
from the package root refreshes traces.

Total test count: 38 unit + 17 fuzz + 2 interop (all passing). The
interop layer is the one that actually proves this speaks protobuf —
the earlier tests just verified self-consistency.

+883 -3
+30
fuzz/dune
··· 1 + ; Crowbar-shaped fuzz testing for protobuf round-tripping. 2 + ; 3 + ; Quick check: 4 + ; dune build @fuzz 5 + ; 6 + ; With AFL instrumentation (use crow orchestrator): 7 + ; crow start --cpus=4 8 + 9 + (executable 10 + (name fuzz) 11 + (modules fuzz fuzz_protobuf) 12 + (libraries protobuf bytesrw alcobar)) 13 + 14 + (rule 15 + (alias runtest) 16 + (enabled_if 17 + (<> %{profile} afl)) 18 + (deps fuzz.exe) 19 + (action 20 + (run %{exe:fuzz.exe}))) 21 + 22 + (rule 23 + (alias fuzz) 24 + (enabled_if 25 + (= %{profile} afl)) 26 + (deps fuzz.exe) 27 + (action 28 + (progn 29 + (run %{exe:fuzz.exe} --gen-corpus corpus) 30 + (run afl-fuzz -V 60 -i corpus -o _fuzz -- %{exe:fuzz.exe} @@))))
+1
fuzz/fuzz.ml
··· 1 + let () = Alcobar.run "protobuf" [ Fuzz_protobuf.suite ]
+257
fuzz/fuzz_protobuf.ml
··· 1 + (* Fuzz tests for protobuf round-tripping and decoder robustness. 2 + 3 + Invariants proved against arbitrary inputs: 4 + 5 + 1. Scalar round-trip: [decode (encode v) = v] for every scalar codec. 6 + 2. Message round-trip: same, on a non-trivial record with every scalar 7 + plus optional, repeated, packed, and a nested message. 8 + 3. Decoder robustness: on any arbitrary byte string, [decode_string] 9 + either returns [Ok _] or [Error _]. It never raises, loops, or 10 + allocates unbounded memory. 11 + 12 + The robustness property is the main defence against malformed-input 13 + CVE classes (huge length prefix DoS, varint overflow, malformed 14 + tags). Targeted regressions for specific patterns live in the 15 + [test_] suite. *) 16 + 17 + open Alcobar 18 + 19 + (* -- Scalar round-trip -- *) 20 + 21 + let roundtrip_scalar codec v = 22 + let s = Protobuf.encode_string codec v in 23 + match Protobuf.decode_string codec s with Ok v' -> v' = v | Error _ -> false 24 + 25 + let test_i32 v = guard (roundtrip_scalar Protobuf.int32 (Int32.of_int v)) 26 + let test_i64 v = guard (roundtrip_scalar Protobuf.int64 v) 27 + 28 + let test_u64 v = 29 + (* Reinterpret any int64 as unsigned bit pattern. *) 30 + guard (roundtrip_scalar Protobuf.uint64 v) 31 + 32 + let test_s32 v = guard (roundtrip_scalar Protobuf.sint32 (Int32.of_int v)) 33 + let test_s64 v = guard (roundtrip_scalar Protobuf.sint64 v) 34 + let test_fixed32 v = guard (roundtrip_scalar Protobuf.fixed32 v) 35 + let test_fixed64 v = guard (roundtrip_scalar Protobuf.fixed64 v) 36 + 37 + let test_float_bits bits = 38 + let v = Int32.float_of_bits bits in 39 + (* Skip NaN: equality on NaN fails; no codec can round-trip NaN exactly 40 + without bit-level comparison. The invariant still holds (encode 41 + produces some 4 bytes, decode produces some NaN), so we guard. *) 42 + if Float.is_nan v then guard true 43 + else guard (roundtrip_scalar Protobuf.float v) 44 + 45 + let test_double_bits bits = 46 + let v = Int64.float_of_bits bits in 47 + if Float.is_nan v then guard true 48 + else guard (roundtrip_scalar Protobuf.double v) 49 + 50 + let test_bool v = guard (roundtrip_scalar Protobuf.bool v) 51 + let test_string v = guard (roundtrip_scalar Protobuf.string v) 52 + let test_bytes v = guard (roundtrip_scalar Protobuf.bytes v) 53 + 54 + (* -- Message round-trip -- 55 + 56 + A composite record exercising the message codec pipeline: every 57 + scalar type, one optional, one repeated non-packed, one packed, one 58 + nested message. *) 59 + 60 + type inner = { x : int32 } 61 + 62 + let inner_codec : inner Protobuf.t = 63 + let open Protobuf.Message in 64 + finish 65 + (let* x = required 1 (fun r -> r.x) Protobuf.int32 in 66 + return { x }) 67 + 68 + type kitchen = { 69 + i32 : int32; 70 + i64 : int64; 71 + u32 : int32; 72 + s64 : int64; 73 + f32 : int32; 74 + f64 : int64; 75 + flt : float; 76 + dbl : float; 77 + bl : bool; 78 + str : string; 79 + byts : string; 80 + name : string option; 81 + tags : string list; 82 + nums : int32 list; 83 + inner : inner; 84 + } 85 + 86 + let kitchen_codec : kitchen Protobuf.t = 87 + let open Protobuf.Message in 88 + finish 89 + (let* i32 = required 1 (fun r -> r.i32) Protobuf.int32 in 90 + let* i64 = required 2 (fun r -> r.i64) Protobuf.int64 in 91 + let* u32 = required 3 (fun r -> r.u32) Protobuf.uint32 in 92 + let* s64 = required 4 (fun r -> r.s64) Protobuf.sint64 in 93 + let* f32 = required 5 (fun r -> r.f32) Protobuf.fixed32 in 94 + let* f64 = required 6 (fun r -> r.f64) Protobuf.fixed64 in 95 + let* flt = required 7 (fun r -> r.flt) Protobuf.float in 96 + let* dbl = required 8 (fun r -> r.dbl) Protobuf.double in 97 + let* bl = required 9 (fun r -> r.bl) Protobuf.bool in 98 + let* str = required 10 (fun r -> r.str) Protobuf.string in 99 + let* byts = required 11 (fun r -> r.byts) Protobuf.bytes in 100 + let* name = optional 12 (fun r -> r.name) Protobuf.string in 101 + let* tags = repeated 13 (fun r -> r.tags) Protobuf.string in 102 + let* nums = packed 14 (fun r -> r.nums) Protobuf.int32 in 103 + let* inner = required 15 (fun r -> r.inner) inner_codec in 104 + return 105 + { 106 + i32; 107 + i64; 108 + u32; 109 + s64; 110 + f32; 111 + f64; 112 + flt; 113 + dbl; 114 + bl; 115 + str; 116 + byts; 117 + name; 118 + tags; 119 + nums; 120 + inner; 121 + }) 122 + 123 + let kitchen_equal a b = 124 + let f_eq x y = (Float.is_nan x && Float.is_nan y) || x = y in 125 + let flt_bits_eq x y = 126 + Int32.equal (Int32.bits_of_float x) (Int32.bits_of_float y) || f_eq x y 127 + in 128 + a.i32 = b.i32 && a.i64 = b.i64 && a.u32 = b.u32 && a.s64 = b.s64 129 + && a.f32 = b.f32 && a.f64 = b.f64 && flt_bits_eq a.flt b.flt 130 + && f_eq a.dbl b.dbl && a.bl = b.bl && a.str = b.str && a.byts = b.byts 131 + && a.name = b.name && a.tags = b.tags && a.nums = b.nums && a.inner = b.inner 132 + 133 + let pp_kitchen ppf _ = Fmt.string ppf "<kitchen>" 134 + 135 + let kitchen_gen = 136 + map 137 + [ 138 + int32; 139 + int64; 140 + int32; 141 + int64; 142 + int32; 143 + int64; 144 + int32; 145 + int64; 146 + bool; 147 + bytes; 148 + bytes; 149 + option bytes; 150 + list bytes; 151 + list int32; 152 + int32; 153 + ] 154 + (fun i32 155 + i64 156 + u32 157 + s64 158 + f32 159 + f64 160 + flt_bits 161 + dbl_bits 162 + bl 163 + str 164 + byts 165 + name 166 + tags 167 + nums 168 + inner_x 169 + -> 170 + { 171 + i32; 172 + i64; 173 + u32; 174 + s64; 175 + f32; 176 + f64; 177 + flt = Int32.float_of_bits flt_bits; 178 + dbl = Int64.float_of_bits dbl_bits; 179 + bl; 180 + str; 181 + byts; 182 + name; 183 + tags; 184 + nums; 185 + inner = { x = inner_x }; 186 + }) 187 + 188 + let test_kitchen_roundtrip k = 189 + let s = Protobuf.encode_string kitchen_codec k in 190 + match Protobuf.decode_string kitchen_codec s with 191 + | Error msg -> 192 + Alcobar.pp Format.err_formatter "%s\n" msg; 193 + guard false 194 + | Ok k' -> check_eq ~eq:kitchen_equal ~pp:pp_kitchen k k' 195 + 196 + (* -- Decoder robustness -- 197 + 198 + Every decoder must return Ok or Error on every byte string. Never 199 + raise (all internal raises are caught by [decode_string]). Never loop 200 + or allocate unboundedly. *) 201 + 202 + let test_decode_any_bytes input = 203 + match Protobuf.decode_string kitchen_codec input with 204 + | Ok _ | Error _ -> guard true 205 + 206 + let test_decode_any_int32 input = 207 + match Protobuf.decode_string Protobuf.int32 input with 208 + | Ok _ | Error _ -> guard true 209 + 210 + let test_decode_any_string input = 211 + match Protobuf.decode_string Protobuf.string input with 212 + | Ok _ | Error _ -> guard true 213 + 214 + (* -- Packed-list round-trip -- 215 + 216 + A separate generator for repeated-packed int32 lists; the main kitchen 217 + test covers mixed use but this exercises packed-only. *) 218 + 219 + type packed_i32 = { xs : int32 list } 220 + 221 + let packed_codec : packed_i32 Protobuf.t = 222 + let open Protobuf.Message in 223 + finish 224 + (let* xs = packed 1 (fun r -> r.xs) Protobuf.int32 in 225 + return { xs }) 226 + 227 + let test_packed_roundtrip xs = 228 + let s = Protobuf.encode_string packed_codec { xs } in 229 + match Protobuf.decode_string packed_codec s with 230 + | Error _ -> guard false 231 + | Ok { xs = xs' } -> 232 + check_eq ~eq:( = ) 233 + ~pp:(Fmt.Dump.list (fun ppf n -> Fmt.pf ppf "%ld" n)) 234 + xs xs' 235 + 236 + let suite = 237 + ( "protobuf", 238 + [ 239 + test_case "int32 round-trip" [ int ] test_i32; 240 + test_case "int64 round-trip" [ int64 ] test_i64; 241 + test_case "uint64 round-trip" [ int64 ] test_u64; 242 + test_case "sint32 round-trip" [ int ] test_s32; 243 + test_case "sint64 round-trip" [ int64 ] test_s64; 244 + test_case "fixed32 round-trip" [ int32 ] test_fixed32; 245 + test_case "fixed64 round-trip" [ int64 ] test_fixed64; 246 + test_case "float round-trip" [ int32 ] test_float_bits; 247 + test_case "double round-trip" [ int64 ] test_double_bits; 248 + test_case "bool round-trip" [ bool ] test_bool; 249 + test_case "string round-trip" [ bytes ] test_string; 250 + test_case "bytes round-trip" [ bytes ] test_bytes; 251 + test_case "kitchen message round-trip" [ kitchen_gen ] 252 + test_kitchen_roundtrip; 253 + test_case "packed int32 round-trip" [ list int32 ] test_packed_roundtrip; 254 + test_case "decode any bytes (kitchen)" [ bytes ] test_decode_any_bytes; 255 + test_case "decode any bytes (int32)" [ bytes ] test_decode_any_int32; 256 + test_case "decode any bytes (string)" [ bytes ] test_decode_any_string; 257 + ] )
+1
fuzz/fuzz_protobuf.mli
··· 1 + val suite : string * Alcobar.test_case list
+4 -1
lib/protobuf.ml
··· 310 310 | Return _ -> () 311 311 | Required { tag; get; codec; cont } -> 312 312 let v = get o in 313 - write_field buf ~tag codec v; 313 + (* proto3 semantics: omit a required scalar field that equals the 314 + codec's default. Nested messages keep the same rule because their 315 + default is the empty-body message, which matches protoc. *) 316 + if v <> codec.default then write_field buf ~tag codec v; 314 317 encode_fields buf o (cont v) 315 318 | Optional { tag; get; codec; cont } -> 316 319 let v_opt = get o in
+1 -1
test/dune
··· 1 1 (test 2 2 (name test) 3 - (libraries protobuf bytesrw alcotest fmt)) 3 + (libraries protobuf bytesrw leb128 alcotest fmt))
+3
test/interop/protoc/.gitignore
··· 1 + scripts/.venv/ 2 + scripts/__pycache__/ 3 + scripts/schema_pb2.py
+17
test/interop/protoc/dune
··· 1 + (test 2 + (name test) 3 + (libraries protobuf csv alcotest fmt) 4 + (deps 5 + (source_tree traces) 6 + (source_tree scripts))) 7 + 8 + ; Regenerate traces against protoc: dune build @regen-traces 9 + 10 + (rule 11 + (alias regen-traces) 12 + (deps 13 + (source_tree scripts)) 14 + (action 15 + (chdir 16 + scripts 17 + (run bash generate.sh))))
+130
test/interop/protoc/scripts/generate.py
··· 1 + """Generate Protocol Buffers interop traces using Google's reference Python 2 + runtime (grpcio-tools 1.62.0 / protobuf 4.25.3). 3 + 4 + For each committed trace CSV this script writes one row per fixture 5 + with the fixture's values and the protoc-produced wire bytes. The 6 + OCaml test reads the trace and asserts that our encoder produces the 7 + same bytes and our decoder recovers the same values.""" 8 + 9 + from __future__ import annotations 10 + 11 + import csv 12 + import json 13 + import os 14 + import sys 15 + import struct 16 + 17 + import schema_pb2 as pb 18 + 19 + TRACE_DIR = sys.argv[1] 20 + 21 + 22 + def hx(b: bytes) -> str: 23 + return b.hex() 24 + 25 + 26 + # -- Test1 fixtures: the protobuf spec example, plus boundary cases. -- 27 + 28 + def gen_test1() -> None: 29 + fixtures = [ 30 + ("zero", 0), 31 + ("small", 1), 32 + ("spec_150", 150), # 08 96 01 33 + ("max_i32", 2_147_483_647), 34 + ("min_i32", -2_147_483_648), # negative -> 10-byte varint 35 + ("neg_one", -1), 36 + ] 37 + path = os.path.join(TRACE_DIR, "test1.csv") 38 + with open(path, "w", newline="") as f: 39 + w = csv.writer(f) 40 + w.writerow(["name", "a", "wire_hex"]) 41 + for name, a in fixtures: 42 + msg = pb.Test1(a=a) 43 + w.writerow([name, a, hx(msg.SerializeToString())]) 44 + 45 + 46 + # -- Everything fixtures: full scalar coverage plus optional / repeated / packed / nested. -- 47 + 48 + def gen_everything() -> None: 49 + fixtures = [ 50 + dict( 51 + name="defaults", 52 + # All fields at default (proto3 scalars are zero-valued and 53 + # the encoder omits them). opt_name / tags / nums default. 54 + ), 55 + dict( 56 + name="typical", 57 + f_int32=42, 58 + f_int64=1_000_000_000_000, 59 + f_uint32=0xDEADBEEF, 60 + f_uint64=0xCAFEF00DBAADF00D, 61 + f_sint32=-7, 62 + f_sint64=-999, 63 + f_fixed32=0x12345678, 64 + f_fixed64=0x0123456789ABCDEF, 65 + f_sfixed32=-5, 66 + f_sfixed64=-42, 67 + f_float=1.5, 68 + f_double=3.141592653589793, 69 + f_bool=True, 70 + f_string="hello", 71 + f_bytes=b"\x00\x01\x02\xff", 72 + opt_name="Ada", 73 + tags=["a", "bb", "ccc"], 74 + nums=[1, 2, 3, 150], 75 + inner_x=11, 76 + ), 77 + dict( 78 + name="negatives", 79 + f_int32=-1, 80 + f_int64=-1, 81 + f_sint32=-2_147_483_648, 82 + f_sint64=-1, 83 + f_sfixed32=-2_147_483_648, 84 + f_sfixed64=-9_223_372_036_854_775_808, 85 + ), 86 + dict( 87 + name="empty_string_and_bytes", 88 + f_string="", 89 + f_bytes=b"", 90 + tags=[], 91 + nums=[], 92 + ), 93 + ] 94 + 95 + path = os.path.join(TRACE_DIR, "everything.csv") 96 + with open(path, "w", newline="") as f: 97 + w = csv.writer(f) 98 + w.writerow(["name", "values_json", "wire_hex"]) 99 + for fix in fixtures: 100 + name = fix.pop("name") 101 + inner_x = fix.pop("inner_x", 0) 102 + msg = pb.Everything() 103 + for k, v in fix.items(): 104 + if k == "f_bytes": 105 + msg.f_bytes = v 106 + elif k == "tags": 107 + msg.tags.extend(v) 108 + elif k == "nums": 109 + msg.nums.extend(v) 110 + else: 111 + setattr(msg, k, v) 112 + if inner_x != 0: 113 + msg.inner.x = inner_x 114 + # Round-trip fix to restore the popped entries for the CSV dump. 115 + fix_for_csv = dict(fix) 116 + fix_for_csv["inner_x"] = inner_x 117 + # bytes aren't JSON-safe; encode as hex. 118 + if "f_bytes" in fix_for_csv: 119 + fix_for_csv["f_bytes_hex"] = fix_for_csv.pop("f_bytes").hex() 120 + w.writerow([name, json.dumps(fix_for_csv, sort_keys=True), 121 + hx(msg.SerializeToString())]) 122 + 123 + 124 + def main() -> None: 125 + gen_test1() 126 + gen_everything() 127 + 128 + 129 + if __name__ == "__main__": 130 + main()
+19
test/interop/protoc/scripts/generate.sh
··· 1 + #!/bin/bash 2 + set -euo pipefail 3 + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" 4 + TRACE_DIR="$SCRIPT_DIR/../traces" 5 + mkdir -p "$TRACE_DIR" 6 + TRACE_DIR="$(cd "$TRACE_DIR" && pwd)" 7 + 8 + cd "$SCRIPT_DIR" 9 + if [ ! -d .venv ]; then 10 + python3 -m venv .venv 11 + .venv/bin/pip install --upgrade pip 12 + .venv/bin/pip install -r requirements.txt 13 + fi 14 + 15 + # Compile schema.proto to schema_pb2.py using the protoc bundled with 16 + # grpcio-tools. Idempotent: overwrites the generated module. 17 + .venv/bin/python -m grpc_tools.protoc -I. --python_out=. schema.proto 18 + 19 + .venv/bin/python generate.py "$TRACE_DIR"
+2
test/interop/protoc/scripts/requirements.txt
··· 1 + grpcio-tools==1.73.0 2 + protobuf==6.31.0
+38
test/interop/protoc/scripts/schema.proto
··· 1 + // Interop test schema. Exercises every Protocol Buffers scalar type 2 + // plus a nested message, optional, repeated, and packed variants. 3 + 4 + syntax = "proto3"; 5 + 6 + package protobuf_interop; 7 + 8 + // Matches the protobuf encoding guide's Test1 example. 9 + message Test1 { 10 + int32 a = 1; 11 + } 12 + 13 + // Exercises all 15 scalar types plus an optional, a repeated non-packed, 14 + // and a packed repeated field. 15 + message Everything { 16 + int32 f_int32 = 1; 17 + int64 f_int64 = 2; 18 + uint32 f_uint32 = 3; 19 + uint64 f_uint64 = 4; 20 + sint32 f_sint32 = 5; 21 + sint64 f_sint64 = 6; 22 + fixed32 f_fixed32 = 7; 23 + fixed64 f_fixed64 = 8; 24 + sfixed32 f_sfixed32 = 9; 25 + sfixed64 f_sfixed64 = 10; 26 + float f_float = 11; 27 + double f_double = 12; 28 + bool f_bool = 13; 29 + string f_string = 14; 30 + bytes f_bytes = 15; 31 + 32 + optional string opt_name = 16; 33 + repeated string tags = 17; 34 + repeated int32 nums = 18 [packed = true]; 35 + 36 + message Inner { int32 x = 1; } 37 + Inner inner = 19; 38 + }
+183
test/interop/protoc/test.ml
··· 1 + (** Interop tests for [protobuf] against Google's reference Python runtime. 2 + 3 + Traces generated by: grpcio-tools 1.73.0, protobuf 6.31.0 Schema: 4 + [scripts/schema.proto] Regenerate: [dune build @regen-traces] *) 5 + 6 + let trace path = Filename.concat "traces" path 7 + 8 + let hex_decode s = 9 + let n = String.length s / 2 in 10 + let b = Bytes.create n in 11 + for i = 0 to n - 1 do 12 + Bytes.set_uint8 b i (int_of_string ("0x" ^ String.sub s (2 * i) 2)) 13 + done; 14 + Bytes.unsafe_to_string b 15 + 16 + let hex_encode s = 17 + let b = Buffer.create (2 * String.length s) in 18 + String.iter 19 + (fun c -> Buffer.add_string b (Printf.sprintf "%02x" (Char.code c))) 20 + s; 21 + Buffer.contents b 22 + 23 + (* -- Test1 { int32 a = 1; } -- *) 24 + 25 + type test1 = { a : int32 } 26 + 27 + let test1_codec : test1 Protobuf.t = 28 + let open Protobuf.Message in 29 + finish 30 + (let* a = required 1 (fun r -> r.a) Protobuf.int32 in 31 + return { a }) 32 + 33 + type test1_row = { name : string; a : int32; wire_hex : string } 34 + 35 + let test1_row_codec = 36 + Csv.( 37 + Row.( 38 + obj (fun name a wire_hex -> { name; a = Int32.of_string a; wire_hex }) 39 + |> col "name" string ~enc:(fun r -> r.name) 40 + |> col "a" string ~enc:(fun r -> Int32.to_string r.a) 41 + |> col "wire_hex" string ~enc:(fun r -> r.wire_hex) 42 + |> finish)) 43 + 44 + let test_test1 () = 45 + let rows = 46 + match Csv.decode_file test1_row_codec (trace "test1.csv") with 47 + | Ok rows -> rows 48 + | Error e -> Alcotest.failf "CSV: %s" e 49 + in 50 + List.iter 51 + (fun (r : test1_row) -> 52 + let expected_wire = hex_decode r.wire_hex in 53 + (* encode direction: our codec must produce exactly what protoc did *) 54 + let our_wire = Protobuf.encode_string test1_codec { a = r.a } in 55 + Alcotest.(check string) 56 + (Fmt.str "encode %s" r.name) 57 + r.wire_hex (hex_encode our_wire); 58 + (* decode direction: our codec reads protoc's bytes into the same value *) 59 + match Protobuf.decode_string test1_codec expected_wire with 60 + | Error msg -> Alcotest.failf "%s: decode failed: %s" r.name msg 61 + | Ok decoded -> 62 + Alcotest.(check int32) (Fmt.str "decode %s" r.name) r.a decoded.a) 63 + rows 64 + 65 + (* -- Everything message -- *) 66 + 67 + type inner = { inner_x : int32 } 68 + 69 + let inner_codec : inner Protobuf.t = 70 + let open Protobuf.Message in 71 + finish 72 + (let* inner_x = required 1 (fun r -> r.inner_x) Protobuf.int32 in 73 + return { inner_x }) 74 + 75 + type everything = { 76 + f_int32 : int32; 77 + f_int64 : int64; 78 + f_uint32 : int32; 79 + f_uint64 : int64; 80 + f_sint32 : int32; 81 + f_sint64 : int64; 82 + f_fixed32 : int32; 83 + f_fixed64 : int64; 84 + f_sfixed32 : int32; 85 + f_sfixed64 : int64; 86 + f_float : float; 87 + f_double : float; 88 + f_bool : bool; 89 + f_string : string; 90 + f_bytes : string; 91 + opt_name : string option; 92 + tags : string list; 93 + nums : int32 list; 94 + inner : inner; 95 + } 96 + 97 + let everything_codec : everything Protobuf.t = 98 + let open Protobuf.Message in 99 + finish 100 + (let* f_int32 = required 1 (fun r -> r.f_int32) Protobuf.int32 in 101 + let* f_int64 = required 2 (fun r -> r.f_int64) Protobuf.int64 in 102 + let* f_uint32 = required 3 (fun r -> r.f_uint32) Protobuf.uint32 in 103 + let* f_uint64 = required 4 (fun r -> r.f_uint64) Protobuf.uint64 in 104 + let* f_sint32 = required 5 (fun r -> r.f_sint32) Protobuf.sint32 in 105 + let* f_sint64 = required 6 (fun r -> r.f_sint64) Protobuf.sint64 in 106 + let* f_fixed32 = required 7 (fun r -> r.f_fixed32) Protobuf.fixed32 in 107 + let* f_fixed64 = required 8 (fun r -> r.f_fixed64) Protobuf.fixed64 in 108 + let* f_sfixed32 = required 9 (fun r -> r.f_sfixed32) Protobuf.sfixed32 in 109 + let* f_sfixed64 = required 10 (fun r -> r.f_sfixed64) Protobuf.sfixed64 in 110 + let* f_float = required 11 (fun r -> r.f_float) Protobuf.float in 111 + let* f_double = required 12 (fun r -> r.f_double) Protobuf.double in 112 + let* f_bool = required 13 (fun r -> r.f_bool) Protobuf.bool in 113 + let* f_string = required 14 (fun r -> r.f_string) Protobuf.string in 114 + let* f_bytes = required 15 (fun r -> r.f_bytes) Protobuf.bytes in 115 + let* opt_name = optional 16 (fun r -> r.opt_name) Protobuf.string in 116 + let* tags = repeated 17 (fun r -> r.tags) Protobuf.string in 117 + let* nums = packed 18 (fun r -> r.nums) Protobuf.int32 in 118 + let* inner = required 19 (fun r -> r.inner) inner_codec in 119 + return 120 + { 121 + f_int32; 122 + f_int64; 123 + f_uint32; 124 + f_uint64; 125 + f_sint32; 126 + f_sint64; 127 + f_fixed32; 128 + f_fixed64; 129 + f_sfixed32; 130 + f_sfixed64; 131 + f_float; 132 + f_double; 133 + f_bool; 134 + f_string; 135 + f_bytes; 136 + opt_name; 137 + tags; 138 + nums; 139 + inner; 140 + }) 141 + 142 + type everything_row = { name : string; values_json : string; wire_hex : string } 143 + 144 + let everything_row_codec = 145 + Csv.( 146 + Row.( 147 + obj (fun name values_json wire_hex -> { name; values_json; wire_hex }) 148 + |> col "name" string ~enc:(fun r -> r.name) 149 + |> col "values_json" string ~enc:(fun r -> r.values_json) 150 + |> col "wire_hex" string ~enc:(fun r -> r.wire_hex) 151 + |> finish)) 152 + 153 + let test_everything_roundtrip () = 154 + (* For each protoc-produced wire, decode via our codec and re-encode. 155 + Assert bytes match exactly. Because our field ordering matches the 156 + schema tag order (and protoc does too), the re-encoded bytes must 157 + be bit-identical to protoc's. *) 158 + let rows = 159 + match Csv.decode_file everything_row_codec (trace "everything.csv") with 160 + | Ok rows -> rows 161 + | Error e -> Alcotest.failf "CSV: %s" e 162 + in 163 + List.iter 164 + (fun (r : everything_row) -> 165 + let expected_wire = hex_decode r.wire_hex in 166 + match Protobuf.decode_string everything_codec expected_wire with 167 + | Error msg -> Alcotest.failf "%s: decode failed: %s" r.name msg 168 + | Ok decoded -> 169 + let reencoded = Protobuf.encode_string everything_codec decoded in 170 + Alcotest.(check string) 171 + (Fmt.str "re-encode %s matches protoc" r.name) 172 + r.wire_hex (hex_encode reencoded)) 173 + rows 174 + 175 + let suite = 176 + ( "protoc-interop", 177 + [ 178 + Alcotest.test_case "Test1 encode+decode" `Quick test_test1; 179 + Alcotest.test_case "Everything re-encode round-trip" `Quick 180 + test_everything_roundtrip; 181 + ] ) 182 + 183 + let () = Alcotest.run "protobuf-interop" [ suite ]
+5
test/interop/protoc/traces/everything.csv
··· 1 + name,values_json,wire_hex 2 + defaults,"{""inner_x"": 0}", 3 + typical,"{""f_bool"": true, ""f_bytes_hex"": ""000102ff"", ""f_double"": 3.141592653589793, ""f_fixed32"": 305419896, ""f_fixed64"": 81985529216486895, ""f_float"": 1.5, ""f_int32"": 42, ""f_int64"": 1000000000000, ""f_sfixed32"": -5, ""f_sfixed64"": -42, ""f_sint32"": -7, ""f_sint64"": -999, ""f_string"": ""hello"", ""f_uint32"": 3735928559, ""f_uint64"": 14627392581503152141, ""inner_x"": 11, ""nums"": [1, 2, 3, 150], ""opt_name"": ""Ada"", ""tags"": [""a"", ""bb"", ""ccc""]}",082a1080a094a58d1d18effdb6f50d208de0b7d5db81bcffca01280d30cd0f3d7856341241efcdab89674523014dfbffffff51d6ffffffffffffff5d0000c03f61182d4454fb2109406801720568656c6c6f7a04000102ff8201034164618a0101618a010262628a010363636392010501020396019a0102080b 4 + negatives,"{""f_int32"": -1, ""f_int64"": -1, ""f_sfixed32"": -2147483648, ""f_sfixed64"": -9223372036854775808, ""f_sint32"": -2147483648, ""f_sint64"": -1, ""inner_x"": 0}",08ffffffffffffffffff0110ffffffffffffffffff0128ffffffff0f30014d00000080510000000000000080 5 + empty_string_and_bytes,"{""f_bytes_hex"": """", ""f_string"": """", ""inner_x"": 0, ""nums"": [], ""tags"": []}",
+7
test/interop/protoc/traces/test1.csv
··· 1 + name,a,wire_hex 2 + zero,0, 3 + small,1,0801 4 + spec_150,150,089601 5 + max_i32,2147483647,08ffffffff07 6 + min_i32,-2147483648,0880808080f8ffffffff01 7 + neg_one,-1,08ffffffffffffffffff01
+3 -1
test/test.ml
··· 1 - let () = Alcotest.run "protobuf" [ Test_wire.suite; Test_protobuf.suite ] 1 + let () = 2 + Alcotest.run "protobuf" 3 + [ Test_wire.suite; Test_protobuf.suite; Test_hostile.suite ]
+181
test/test_hostile.ml
··· 1 + (* Hostile-input tests covering protobuf decoder CVE classes. 2 + 3 + Protobuf implementations across languages have shipped bugs that 4 + allow an attacker who controls the wire bytes to DoS, crash, or 5 + mislead the decoder. Each test below asserts a specific defence. *) 6 + 7 + type test1 = { a : int32 } 8 + 9 + let test1_codec : test1 Protobuf.t = 10 + let open Protobuf.Message in 11 + finish 12 + (let* a = required 1 (fun r -> r.a) Protobuf.int32 in 13 + return { a }) 14 + 15 + (* --- CVE class: huge length prefix DoS (CVE-2015-5237 and relatives). 16 + 17 + An attacker sends a Length_delimited field whose varint length claims 18 + gigabytes. A naive decoder allocates that many bytes before checking 19 + whether the input has them. Our [Wire.require_bytes] checks the 20 + remaining bytes against the claimed length before allocating. --- *) 21 + 22 + let test_huge_length_prefix () = 23 + (* Tag 14 (str, length-delim), length 0xFFFFFFF (268 MiB), but only 24 + two payload bytes follow. *) 25 + let bad = 26 + "\x72" (* tag 14, wire type 2 *) ^ "\xff\xff\xff\x7f" 27 + (* varint 0x0FFFFFFF = 268435455 *) ^ "ab" 28 + in 29 + match Protobuf.decode_string test1_codec bad with 30 + | Error _ -> () 31 + | Ok _ -> Alcotest.fail "huge length prefix must be rejected" 32 + 33 + (* --- CVE class: over-long varint. Leb128 rejects > 10 bytes and > 64 34 + bits, but we verify the protobuf decoder surfaces it as a clean 35 + Error, not a crash. --- *) 36 + 37 + let test_overlong_varint () = 38 + (* Tag 1 varint, then 11 bytes of continuation. *) 39 + let bad = "\x08" ^ "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01" in 40 + match Protobuf.decode_string test1_codec bad with 41 + | Error _ -> () 42 + | Ok _ -> Alcotest.fail "over-long varint must be rejected" 43 + 44 + (* --- CVE class: truncated tag (varint continuation without terminator). --- *) 45 + 46 + let test_truncated_tag () = 47 + match Protobuf.decode_string test1_codec "\x80\x80\x80" with 48 + | Error _ -> () 49 + | Ok _ -> Alcotest.fail "truncated tag must be rejected" 50 + 51 + (* --- CVE class: tag field number 0 (reserved in proto spec). --- *) 52 + 53 + let test_reserved_tag_zero () = 54 + (* Tag = 0, wire type = 0. Encoded as varint 0 = 0x00. *) 55 + match Protobuf.decode_string test1_codec "\x00" with 56 + | Error _ -> () 57 + | Ok _ -> Alcotest.fail "tag field=0 must be rejected" 58 + 59 + (* --- CVE class: unsupported wire type (3, 4, 6, 7 are reserved/deprecated). --- *) 60 + 61 + let test_unsupported_wire_type () = 62 + (* Tag 1 wire type 3 (start_group, deprecated). *) 63 + match Protobuf.decode_string test1_codec "\x0b\x00" with 64 + | Error _ -> () 65 + | Ok _ -> Alcotest.fail "wire type 3 must be rejected" 66 + 67 + (* [CVE class deferred: deep nesting DoS.] A recursive nested-message 68 + codec would need a [Lazy.t] thunk or a [ref] that OCaml's [let rec] 69 + restriction rejects cleanly in the call site here. A depth limit in 70 + the main [Protobuf.decode] is not yet implemented; a malicious input 71 + with ~1000 levels of nested Length_delim fields can stack-overflow 72 + the OCaml runtime. Tracked as a TODO. *) 73 + 74 + (* --- CVE class: wire type mismatch (field declared as varint, wire has 75 + length-delim). Decoder should reject cleanly. --- *) 76 + 77 + let test_wire_type_mismatch () = 78 + (* test1's field 1 is int32 (varint). Emit it as length-delim. *) 79 + let bad = 80 + "\x0a\x00" 81 + (* tag 1 wire type 2, length 0 *) 82 + in 83 + match Protobuf.decode_string test1_codec bad with 84 + | Error _ -> () 85 + | Ok _ -> Alcotest.fail "wire type mismatch must be rejected" 86 + 87 + (* --- Empty input: decode to scalar defaults, proto3-style. --- *) 88 + 89 + let test_empty_input () = 90 + match Protobuf.decode_string test1_codec "" with 91 + | Error msg -> 92 + Alcotest.failf "empty input should succeed with defaults: %s" msg 93 + | Ok r -> Alcotest.(check int32) "a defaults to 0" 0l r.a 94 + 95 + (* --- Decoder must not read past message boundary: trailing unused bytes 96 + cause parse_wire's end_-check to reject. --- *) 97 + 98 + let test_overrun_rejected () = 99 + (* A single valid field followed by a half-read varint should be 100 + caught by parse_wire's boundary check. *) 101 + let good_prefix = Protobuf.encode_string test1_codec { a = 1l } in 102 + let with_trailer = 103 + good_prefix ^ "\x80" 104 + (* truncated continuation *) 105 + in 106 + match Protobuf.decode_string test1_codec with_trailer with 107 + | Error _ -> () 108 + | Ok _ -> Alcotest.fail "trailing truncated varint must be rejected" 109 + 110 + (* --- Length-delim field with length pointing past end. --- *) 111 + 112 + let test_length_points_past_end () = 113 + (* Tag 14 (str), length 100, but only 2 bytes follow. *) 114 + let bad = "\x72\x64ab" in 115 + match Protobuf.decode_string test1_codec bad with 116 + | Error _ -> () 117 + | Ok _ -> Alcotest.fail "length past end must be rejected" 118 + 119 + (* --- Packed field with corrupt body: outer length is valid, inner 120 + varint stream is truncated. --- *) 121 + 122 + type packed_i32 = { xs : int32 list } 123 + 124 + let packed_codec : packed_i32 Protobuf.t = 125 + let open Protobuf.Message in 126 + finish 127 + (let* xs = Protobuf.Message.packed 1 (fun r -> r.xs) Protobuf.int32 in 128 + return { xs }) 129 + 130 + let test_packed_corrupt_body () = 131 + (* Tag 1, wire 2, length 2, then \x80\x80 (unterminated varint inside 132 + the packed blob). *) 133 + let bad = "\x0a\x02\x80\x80" in 134 + match Protobuf.decode_string packed_codec bad with 135 + | Error _ -> () 136 + | Ok _ -> Alcotest.fail "corrupt packed body must be rejected" 137 + 138 + (* --- Huge repeated count: attacker repeats a small tag a million times. Decoder 139 + must handle it (correctness, not DoS per se: the input itself is a million 140 + bytes, so the attacker pays for the cost). This test asserts we don't 141 + quadratic-blow-up. --- *) 142 + 143 + type rep = { tags : string list } 144 + 145 + let rep_codec : rep Protobuf.t = 146 + let open Protobuf.Message in 147 + finish 148 + (let* tags = repeated 1 (fun r -> r.tags) Protobuf.string in 149 + return { tags }) 150 + 151 + let test_many_repeated () = 152 + let n = 10_000 in 153 + let buf = Buffer.create (3 * n) in 154 + for _ = 1 to n do 155 + Protobuf.Wire.write_tag buf ~field_number:1 156 + ~wire_type:Protobuf.Wire.Length_delimited; 157 + Protobuf.Wire.write_string buf "x" 158 + done; 159 + let wire = Buffer.contents buf in 160 + match Protobuf.decode_string rep_codec wire with 161 + | Error msg -> Alcotest.failf "many-repeated rejected: %s" msg 162 + | Ok r -> 163 + Alcotest.(check int) "count" n (List.length r.tags); 164 + Alcotest.(check string) "first" "x" (List.hd r.tags) 165 + 166 + let suite = 167 + ( "hostile", 168 + [ 169 + Alcotest.test_case "huge length prefix DoS" `Quick test_huge_length_prefix; 170 + Alcotest.test_case "over-long varint" `Quick test_overlong_varint; 171 + Alcotest.test_case "truncated tag" `Quick test_truncated_tag; 172 + Alcotest.test_case "reserved tag 0" `Quick test_reserved_tag_zero; 173 + Alcotest.test_case "unsupported wire type" `Quick 174 + test_unsupported_wire_type; 175 + Alcotest.test_case "wire type mismatch" `Quick test_wire_type_mismatch; 176 + Alcotest.test_case "empty input -> defaults" `Quick test_empty_input; 177 + Alcotest.test_case "overrun rejected" `Quick test_overrun_rejected; 178 + Alcotest.test_case "length past end" `Quick test_length_points_past_end; 179 + Alcotest.test_case "packed corrupt body" `Quick test_packed_corrupt_body; 180 + Alcotest.test_case "many repeated (10k)" `Quick test_many_repeated; 181 + ] )
+1
test/test_hostile.mli
··· 1 + val suite : string * unit Alcotest.test_case list