~cypheon/ecertmon

c41253492b6d9ba83cec931602aaa355908bdaed — Johann Rudloff 3 months ago b4a31be
Allow a certain threshold of errors before reporting invalid cert

A `max_error_count` config entry can be provided. As long as no more than
that number of errors occur in a row, the certificate validity is reported
as the "old" value (from before the first error). If more than
`max_error_count` errors occur in a row, validity is reported as `0` (before
this commit, that was the default behaviour after the first error).
3 files changed, 34 insertions(+), 10 deletions(-)

M config/certmon.sample.config
M src/cert_scanner.erl
M src/metrics_handler.erl
M config/certmon.sample.config => config/certmon.sample.config +1 -0
@@ 1,5 1,6 @@
{port, 9101}.
{scan_interval, 30000}.
{max_error_count, 10}.
{targets, [
           {"neo.sinyax.net", 443}
          ]

M src/cert_scanner.erl => src/cert_scanner.erl +26 -5
@@ 18,7 18,11 @@
          hostname,
          port,
          timer,
          validity
          validity,
          % count of errors since last success
          errorCount,
          % total count of errors
          totalErrors
}).

%% API.


@@ 38,11 42,16 @@ init([{Hostname, Port}]) ->
          hostname = Hostname,
          port = Port,
          timer = Timer,
          validity = unknown
          validity = unknown,
          errorCount = 0,
          totalErrors = 0
         }}.

handle_call({get_status}, _From, State) ->
  {reply, State#state.validity, State};
  {reply, [
           State#state.validity,
           {errors, State#state.totalErrors}
          ], State};
handle_call(_Request, _From, State) ->
  {reply, ignored, State}.



@@ 83,10 92,22 @@ do_scan(State) ->
      RemainingDays = (NotAfterSeconds - NowSeconds) / (24 * 3600),
      logger:debug("peer cert ~s valid until: ~p (unix epoch: ~p, remaining days: ~p)~n",
                   [Hostname, NotAfter, NotAfterEpoch, RemainingDays]),
      {noreply, State#state{validity = {valid, NotAfterEpoch}}};
      {noreply, State#state{validity = {valid, NotAfterEpoch}, errorCount = 0}};
    {error, Reason} ->
      logger:warning("failed to connect to ~p: ~p", [Hostname, Reason]),
      {noreply, State#state{validity = {error, Reason}}}
      {ok, MaxErrorCount} = application:get_env(max_error_count),

      % if max error count is not reached, we keep the old validity
      ReportedValidity = if
                           State#state.errorCount < MaxErrorCount ->
                             State#state.validity;
                           true -> {error, Reason}
                         end,
      {noreply, State#state{
                  validity = ReportedValidity,
                  errorCount = State#state.errorCount + 1,
                  totalErrors = State#state.totalErrors + 1
      }}
  end.

%% utilities

M src/metrics_handler.erl => src/metrics_handler.erl +7 -5
@@ 6,16 6,18 @@
format_metric({Hostname, Port}) ->
  Target = {Hostname, Port},
  {ok, ScannerPid} = certmon_sup:get_scanner_pid(Target),
  Valid = try gen_server:call(ScannerPid, {get_status}) of
            {valid, Epoch} -> Epoch;
            _ -> 0
  Response = try gen_server:call(ScannerPid, {get_status}) of
            R -> R
          catch
            exit:Reason ->
              logger:error("failed to get status for ~p: Reason: ~p", [{Hostname, Port}, Reason]),
              0
          end,
  [<<"tls_cert_expiry{hostname=\"">>, Hostname, <<"\", port=\"">>,
   io_lib:format("~p", [Port]), <<"\"} ">>, io_lib:format("~p\n", [Valid])
  Valid = proplists:get_value(valid, Response, 0),
  Errors = proplists:get_value(errors, Response, 0),
  [
   io_lib:format("tls_cert_expiry{hostname=\"~p\", port=\"~p\"} ~p\n", [Hostname, Port, Valid]),
   io_lib:format("tls_cert_errors{hostname=\"~p\", port=\"~p\"} ~p\n", [Hostname, Port, Errors])
  ].

init(Req0, State) ->