···44 extract::{Extension, Path},
55 response::Html,
66};
77+use http::header::HeaderMap;
78use lazy_static::lazy_static;
89use prometheus::{opts, register_int_counter_vec, IntCounterVec};
910use std::sync::Arc;
···2526 Ok(Html(result))
2627}
27282828-#[instrument(skip(state))]
2929+#[instrument(skip(state, headers))]
2930pub async fn post_view(
3031 Path(name): Path<String>,
3132 Extension(state): Extension<Arc<State>>,
3333+ headers: HeaderMap,
3234) -> Result {
3335 let mut want: Option<Post> = None;
3436···3840 }
3941 }
40424343+ let referer = if let Some(referer) = headers.get(http::header::REFERER) {
4444+ let referer = referer.to_str()?.to_string();
4545+ Some(referer)
4646+ } else {
4747+ None
4848+ };
4949+4150 match want {
4251 None => Err(PostNotFound(name).into()),
4352 Some(post) => {
···4655 .inc();
4756 let body = templates::Html(post.body_html.clone());
4857 let mut result: Vec<u8> = vec![];
4949- templates::talkpost_html(&mut result, post, body)?;
5858+ templates::talkpost_html(&mut result, post, body, referer)?;
5059 Ok(Html(result))
5160 }
5261 }
+423
talks/conf42-static-analysis.markdown
···11+---
22+title: How Static Code Analysis Prevents You From Waking Up at 3AM With Production on Fire
33+date: 2022-06-09
44+slides_link: https://cdn.xeiaso.net/file/christine-static/talks/Conf42+SRE+2022.pdf
55+---
66+77+# How Static Code Analysis Prevents You From Waking Up at 3AM With Production on Fire
88+99+<style>
1010+img {
1111+ display: block;
1212+ margin-left: auto;
1313+ margin-right: auto;
1414+}
1515+</style>
1616+1717+<center><iframe width="560" height="315" src="https://www.youtube.com/embed/cVUrScvthqs"
1818+title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
1919+allowfullscreen></iframe></center>
2020+2121+<xeblog-conv name="Cadey" mood="coffee">The talk video will be live at 2022 M06
2222+10 at 13:00 EDT. It will not work if you are reading this at the exact
2323+time of release or before it is released via Patreon.</xeblog-conv>
2424+2525+
2626+2727+Hi, I’m Xe Iaso and today I’m going to talk about static analysis and how it
2828+helps you engineer more reliable systems. This will help you make it harder for
2929+incorrect code to blow up production at 3AM. There are a lot of tools out there
3030+that can do this for a variety of languages, however I’m going to focus on Go
3131+because that is what I am an expert in. In this talk I’ll cover the problem
3232+space, some solutions you can apply today and how you can work with people to
3333+engineer more reliable systems.
3434+3535+
3636+3737+As I said, I’m Xe. I’m the Archmage of Infrastructure at Tailscale. I’ve been an
3838+SRE for long enough that I have moved over into developer relations. As a
3939+disclaimer, this talk may contain opinions. None of these opinions are of my
4040+employer.
4141+4242+I’ll have a recording of this talk, slides, speaker notes, and a transcript
4343+up in a day or two after the conference. The QR code in the corner of the screen
4444+will take you to my blog.
4545+4646+
4747+4848+When starting to think about the problem, I find it helps to start thinking
4949+about the problem space. This usually means thinking about the total problem at
5050+an incredibly high level.
5151+5252+So let’s think about the problem space of compilers. At the highest possible
5353+level, a compiler can take literally anything as input and maybe produce an
5454+output.
5555+5656+
5757+5858+A compiler’s job is to take this anything, see if it matches a set of rules and
5959+then produce an output of some kind. In the case of the Go compiler, this means
6060+that the input needs to match the rules that the Go language has defined in its
6161+specification.
6262+6363+
6464+6565+This human-readable specification outlines core rules of the Go language. These
6666+include things like every `.go` file needs to be in a package, the need to
6767+declare variables before using them, what core types are in the language, how to
6868+deal with slices, etc.
6969+7070+However this specification doesn’t define what _correct_ Go code is. It only
7171+defines what _valid_ Go code is. This is normal for specifications of this kind,
7272+ensuring correctness is an active field of research in computer science that
7373+small scrappy startups like Google, Microsoft and Apple struggle with.
7474+7575+
7676+7777+As a result though, you can’t rely on the compiler itself to stop
7878+incorrect code from being deployed into production. A lot of trivial errors will be
7979+stopped in the process, but it won’t stop more subtle errors. This is an
8080+example of the kind of error that the Go compiler can catch by itself, if you
8181+declare a value as an integer you can’t then put a string in it. They are
8282+different types and the compiler will reject it.
8383+8484+
8585+8686+I know one of you out there is probably thinking something like “What about
8787+other languages like Rust and Haskell? Aren’t those compilers known for
8888+correctness?”
8989+9090+
9191+9292+That’s a good point, there are other languages that have more strict rules like
9393+linear types and explicitly marking code that pokes the outside world. However the kinds
9494+of errors that are brought up in this talk can still happen in those languages,
9595+even if it’s more difficult to do that by accident.
9696+9797+
9898+9999+Static analysis on top of your existing compiler lets you move closer to
100100+correctness without going the maximalist route like when using Rust or Haskell.
101101+102102+
103103+104104+It’s a balance between pragmatism and correctness. The pragmatic solution and
105105+the correct solution are always in conflict, so you need to find a way down the
106106+middle.
107107+108108+
109109+110110+In general, proving everything is correct with static analysis is impossible. It
111111+takes a theoretically infinite amount of time to tell if absolutely every facet
112112+of the code is correct in every single way. This is a case where the perfect is
113113+the enemy of the good, so here are some patterns for things that can be proven
114114+with static analysis in Go:
115115+116116+
117117+118118+* Forgetting to close an HTTP response body
119119+* Making typos in struct tags
120120+* Ensuring that cancellable contexts get cancelled in trivially provable ways
121121+* Writing invalid time formats
122122+* Writing an invalid regular expression that would otherwise blow up at runtime
123123+124124+
125125+126126+These kinds of things are easy to prove and are enabled by default in `go vet`
127127+and staticcheck.
128128+129129+Also for the record, incorrect code won’t explode instantly upon it being run.
130130+The devil is in the details of how it is incorrect and how those things can pile
131131+up to create issues downstream. Incorrect code can also confuse you while trying
132132+to debug it, which can make you waste time you could spend doing anything else.
133133+134134+
135135+136136+This is an example of Go code that will compile, will likely work, but is incorrect.
137137+138138+
139139+140140+This is incorrect because the HTTP response is read from, but never closed.
141141+Failing to do this in Go will cause you to leak the resources associated with
142142+the HTTP connection. When you close the response, it releases the connection so
143143+that it can be used for other HTTP actions.
144144+145145+If you don’t do this, you can easily run into a state where your server
146146+application will run out of available sockets at 3AM. So you may be tempted to
147147+fix it like this:
148148+149149+
150150+151151+However this is incorrect too. Look at where the `defer` is called.
152152+153153+Let’s think about how the program flow will work. I’m going to translate this
154154+into a diagram of how this program will be executed.
155155+156156+
157157+158158+This flowchart is another way to think about how this program is being executed.
159159+It starts on the left side and flows to the end on the right.
160160+161161+
162162+163163+In this case we start with the http dot Get call and then defer closing the
164164+response body. Then we check to see if there was an error or not.
165165+166166+
167167+168168+If there wasn’t an error, we can use the response and do something useful, then
169169+the response body closes automatically due to the deferred close. Everything
170170+works as expected.
171171+172172+
173173+174174+However if there was an error, something different happens. The error is
175175+returned and then the scheduled Close call runs. The Close call assumes that the
176176+response is valid, but it’s not. This results in the program panicking which is
177177+a crash at 3AM. This is the kind of place that static analysis comes in to save
178178+you. Let’s take a look at what `go vet` says about this code:
179179+180180+
181181+182182+It caught that error! To fix this we need to move the `defer` call to after the
183183+error check like this:
184184+185185+
186186+187187+The response body is closed after we know it’s usable. This will work as we
188188+expect in production. This is an example of how trivial errors can be fixed with
189189+a little extra tooling without having to use an entirely maximalist approach.
190190+191191+
192192+193193+If you use `go test` then a large amount of `go vet` checks are run by default.
194194+This covers a wide variety of common issues with trivial fixes that help move
195195+your code towards the corresponding Go idioms. It’s limited to the subset of
196196+checks that aren’t known to have false positives, so if you want to have more
197197+assurance you will need to run `go vet` in your continuous integration step.
198198+199199+
200200+201201+<xeblog-conv name="Mara" mood="hmm">If these are so trivially detectable, why
202202+isn’t this part of the normal `go build` flow?</xeblog-conv>
203203+204204+
205205+206206+The reason this isn’t done by default is kind of a matter of philosophy. Go
207207+isn’t a language that wants to make it impossible to write buggy code. Go just
208208+wants to give you tools to make your life easier.
209209+210210+In the Go team’s view, they would rather buggy code get compiled than have the
211211+compiler reject your code on accident.
212212+213213+It’s the result of a philosophy of trusting that there are gaps between the
214214+programmer and production servers. During those gaps there are tools like
215215+Staticcheck and `go vet` in addition to human review.
216216+217217+
218218+219219+Here’s an example of a more complicated problem that Staticcheck can catch.
220220+221221+
222222+223223+Go lets you make variables that are scoped to if statements. This lets you write
224224+code like this:
225225+226226+
227227+228228+Which is shorthand for writing out something like this:
229229+230230+
231231+232232+This does the same thing, but it looks a bit more ugly. The `err` value isn’t in
233233+scope at the end of the inline block, so it will be dropped by the garbage
234234+collector.
235235+236236+
237237+238238+However let’s also consider the other important part of this snippet: variable shadowing.
239239+240240+
241241+242242+We have two different variables named `x` and they are different types and
243243+declared at different places. To help tell them apart I’ve coloured the inner
244244+one yellow and the outer one red.
245245+246246+In a type assertion like this the red variable is not an `int` but the yellow
247247+variable is an `int` that might have failed to assert down. If it fails to
248248+assert down, then the yellow `x` variable will always be an `int` and have the value
249249+`0`. This is probably not what you want, given that the log call with `%T`
250250+format specifier would let you know what type the red `x` variable was.
251251+252252+When this code is run, you will get an error message like this:
253253+254254+
255255+256256+This will confuse the living hell out of you. The correct fix here is to rename
257257+the int version of `x`. You could do this in a few ways, but here’s a valid
258258+approach:
259259+260260+
261261+262262+This will get the correct result. You would need to change the `ok` branch of
263263+the `if` statement to use `xInt` instead of `x`, but the Go language server can
264264+usually automate this (in Emacs you’d press `M-x` and type in `lsp-rename` and
265265+hit enter).
266266+267267+There are a bunch of other checks that Staticcheck runs by default and I could
268268+easily talk about them for a few hours, but I’m gonna focus on one of the more
269269+interestingly subtle checks.
270270+271271+
272272+273273+In Go it’s a common pattern to write custom error types. With Go interfaces and
274274+their “duck typing”, anything that matches the definition of the `error`
275275+interface is able to be used as an `error` value.
276276+277277+
278278+279279+The type Failure has an Error method, which means that we can treat it as an
280280+error.
281281+282282+
283283+284284+However the receiver of the function is a pointer value. Normally this means a
285285+few things, but in this case it means that the receiver may be nil.
286286+287287+
288288+289289+Because of this we can return a nil value of Failure, but then when you try to
290290+use it from Go it will explode at runtime:
291291+292292+
293293+294294+Boom! It crashed! Segfault!
295295+296296+
297297+298298+This happens because under the hood each interface value is a box. This box
299299+contains the type of the value in the box and a pointer to the actual value
300300+itself. But, this box will always exist even if the underlying value is `nil`.
301301+302302+This is always frustrating when you run into it, but let’s see what Staticcheck
303303+says when you run it against this code:
304304+305305+
306306+307307+Staticcheck will reject it. If this code was checked into source control and
308308+Staticcheck was run in CI, tests would fail.
309309+310310+
311311+312312+The correct version of doWork should look like this.
313313+314314+
315315+316316+Note how I changed the failure case to use an untyped `nil`. This prevents the
317317+`nil` value from being boxed into an interface. This will do the right thing.
318318+319319+
320320+321321+This will help you ensure that this kind of code never enters production so it
322322+cannot fail at untold hours of the night while you are sleeping.
323323+324324+
325325+326326+As SREs, we tend to sleep very little as is. Statistically we have higher rates
327327+of burnout, mind fog, fatigue and likelihood of turning into angry, sad people
328328+as we do this job longer and longer. Especially if the culture of a company is
329329+broken enough that you end up being on call during sleeping hours.
330330+331331+This is not healthy. It is not sustainable for us to be woken up at obscene
332332+hours of the night because of trivial and preventable errors. If we get woken up
333333+in the night, it should be for things that are measurably novel and not caused
334334+by errors that should have never been allowed to be deployed in the first place.
335335+336336+
337337+338338+I don’t think I’ve heard my pager sound in years by this point, but the last
339339+time I heard it I almost had a full blown panic attack. I have been in the kind
340340+of place where burnout from the pager severely affected my health.
341341+342342+I’m still recovering from the after-effects of that tour of SRE duty, and it has
343343+resulted in me making permanent career changes so that I am never put in that
344344+kind of position again. I don’t wish this hell on anyone.
345345+346346+
347347+348348+Normally things can feel like this when you are an SRE put in the line of pager
349349+fire. It feels like both fixing production and being able to get more sleep are
350350+unworkable and that you would have severe difficulty getting from one side to
351351+the other.
352352+353353+
354354+355355+Adding static analysis to your continuous integration setup can allow you to
356356+walk down a middle path between these two extremes. It is not going to be
357357+perfect, however gradually things will get better.
358358+359359+Trivial errors will be blocked from going into production and you will be able
360360+to sleep easier.
361361+362362+
363363+364364+The benefits of being able to rest easier like this are numerous and difficult
365365+to summarize in the short amount of time I have left. It could save your
366366+relationship with your loved ones. It could prevent people near you from
367367+resenting you.
368368+369369+It could be the difference between a long and happy career or having to drop out
370370+of tech at 25; burnt out to a crisp and unable to do much of anything.
371371+372372+
373373+374374+It could be the difference between life and an early, untimely death from a
375375+preventable heart attack.
376376+377377+
378378+379379+In talks like these it’s easy to ignore the fact that the people who are responsible
380380+for making sure services are reliable are just that. Human. Company culture may get
381381+in the way, there may be a lack of people that are willing or able to take the
382382+pager rotation.
383383+384384+
385385+386386+However when the machines come to take our jobs, I hope this one is one of the
387387+first that they take.
388388+389389+
390390+391391+In the meantime, all we can do is get towards a more sustainable future. And the
392392+best thing we can do is make sure people sleep well without having to worry
393393+about being woken up because of errors that tools like Staticcheck can block
394394+from getting into production.
395395+396396+
397397+398398+If you use Go in production, I highly suggest using Staticcheck. If you find it
399399+useful, sponsor Dominik on GitHub. Software like this is complicated to develop
400400+and the best way to ensure Dominik can keep developing it is to pay him for his
401401+efforts. The better he sleeps, the better you sleep as an SRE.
402402+403403+
404404+405405+As for other languages, I don't know what the best practices are. You will have
406406+to do research on this, you may have to work together with coworkers to figure
407407+out what would be the best option for your team. It is worth the effort though.
408408+This helps you make a better product for everyone, and it's worth the teething
409409+pains at first.
410410+411411+
412412+413413+I’m almost at the end of the presentation, but I wanted to give a special shout
414414+out to all of these people who helped make this talk a reality. I want to also
415415+give out a special shout out to my coworkers at Tailscale that let me load shed
416416+so I could focus on making this talk shine.
417417+418418+
419419+420420+Thanks for watching! I’ll stick around in the chat for questions, but if I miss
421421+your question and you really want an answer to it, please email it to
422422+code42sre2022@xeserv.us. I’m happy to answer questions and I enjoy writing up
423423+responses.
+6-12
templates/talkpost.rs.html
···11-@use super::{header_html, footer_html, mara};
22-@use crate::post::Post;
11+@use super::{header_html, footer_html};
22+@use crate::{post::Post, tmpl::nag};
33@use chrono::prelude::*;
4455-@(post: Post, body: impl ToHtml)
55+@(post: Post, body: impl ToHtml, referer: Option<String>)
6677@:header_html(Some(&post.front_matter.title.clone()), None)
88···4646 @}
4747</script>
48484949-@if Utc::today().num_days_from_ce() < post.date.num_days_from_ce() {
5050-<div class="warning">
5151- @:mara("hacker", "Mara", Html(format!(r#"Hey, this post is set to go live to the public on {} UTC. Right now you are reading a pre-publication version of this post. Please do not share this on social media. This post will automatically go live for everyone on the intended publication date. If you want access to these posts, please join the <a href="https://patreon.com/cadey">Patreon</a>. It helps me afford the copyeditor that I contract for the technical content I write."#, post.detri())))
5252-</div>
5353-} else {
5454-<script async src="https://media.ethicalads.io/media/client/ethicalads.min.js"></script>
5555-}
4949+@Html(nag::referer(referer).0)
5050+5151+@Html(nag::prerelease(&post).0)
56525753@body
58545955<a href="@post.front_matter.slides_link.as_ref().unwrap()">Link to the slides</a>
6060-6161-<div data-ea-publisher="christinewebsite" data-ea-type="text"></div>
62566357<hr />
6458