Clean up detection sentry events + tests (#5833)

* add module name to service_error when check times out

Otherwise, it can sometimes remain unclear in the diagnostics, whether
it was InstallationV2 or InstallationV2CacheBust that timed out.

* Remove duplicate timeout logic

The current production logs show two types of verification timeouts:

* service_error: "Unhandled Browserless response status: 408" (vast
  majority of cases)
* service_error: :timeout (only a few cases)

The latter happens when we hit the Req receive_timeout
(endpoint_timeout + 2s). I've seen Browserless not respect the timeout
param from time to time, so it's better to keep the timeout logic
"in-house" only.

* make service_error into a map with code and extra

* interpret temporary service errors

...but still consider them "unhandled" for telemetry, also notifying Sentry
and logging the warning.

* separate sentry messages (verification)

* make Verification.ChecksTest more DRY

* organize tests into describe blocks

* test verification telemetry and logging

* fix codespell

* get rid of legacy verification

* rename Checks.InstallationV2 -> Checks.VerifyInstallation

* delete Live.Installation and rename Live.InstallationV2 -> Live.Installation

* rename installationv2 (live) files as well

* delete old change-domain routes

Also rename current liveview modules and routes, removing the v2 suffix

* rename domain_change_v2 files, removing v2 suffix

* remove legacy JS verifier code

Also fix dockerignore and elixir.yml referencing a wrong priv path

* rename verification_v2_test -> verification_test

* remove v2 prefix from logs and sentry messages

* clean up duplicate external_sites_controller_test.exs tests

* remove flag

* fix typespec

* pass timeout as query param to Browserless too

* Fixup external sites controller test module (#5826)

* fix test description

* clean up detection sentry events + tests

* improve naming

---------

Co-authored-by: Artur Pata <artur.pata@gmail.com>
This commit is contained in:
RobertJoonas 2025-10-27 10:31:24 +00:00 committed by GitHub
parent a83b4f3583
commit 7540511deb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 259 additions and 42 deletions

View File

@ -37,8 +37,8 @@ defmodule Plausible.InstallationSupport.Detection.Checks do
)
end
def telemetry_event_handled(), do: [:plausible, :detection, :handled]
def telemetry_event_unhandled(), do: [:plausible, :detection, :unhandled]
def telemetry_event_success(), do: [:plausible, :detection, :success]
def telemetry_event_failure(), do: [:plausible, :detection, :failure]
def interpret_diagnostics(%State{
diagnostics: diagnostics,
@ -47,24 +47,36 @@ defmodule Plausible.InstallationSupport.Detection.Checks do
}) do
result = Detection.Diagnostics.interpret(diagnostics, url)
case result.data do
%{unhandled: true} ->
Sentry.capture_message("Unhandled case for detection",
extra: %{
message: inspect(diagnostics),
url: url,
hash: :erlang.phash2(diagnostics)
}
)
{failed?, trigger_sentry?, msg} =
case result do
%{ok?: true} ->
{false, false, nil}
Logger.warning(
"[DETECTION] Unhandled case (data_domain='#{data_domain}'): #{inspect(diagnostics)}"
)
%{data: %{failure: :customer_website_issue}} ->
{true, false, "Failed due to an issue with the customer website"}
:telemetry.execute(telemetry_event_unhandled(), %{})
%{data: %{failure: :browserless_issue}} ->
{true, true, "Failed due to a Browserless issue"}
_ ->
:telemetry.execute(telemetry_event_handled(), %{})
_unknown_failure ->
{true, true, "Unknown failure"}
end
if failed? do
:telemetry.execute(telemetry_event_failure(), %{})
Logger.warning("[DETECTION] #{msg} (data_domain='#{data_domain}'): #{inspect(diagnostics)}")
else
:telemetry.execute(telemetry_event_success(), %{})
end
if trigger_sentry? do
Sentry.capture_message("[DETECTION] #{msg}",
extra: %{
message: inspect(diagnostics),
url: url,
hash: :erlang.phash2(diagnostics)
}
)
end
result

View File

@ -24,7 +24,7 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
} = diagnostics,
_url
) do
get_result("gtm", diagnostics)
success("gtm", diagnostics)
end
def interpret(
@ -34,7 +34,7 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
} = diagnostics,
_url
) do
get_result(
success(
"wordpress",
diagnostics
)
@ -47,7 +47,7 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
} = diagnostics,
_url
) do
get_result("npm", diagnostics)
success("npm", diagnostics)
end
def interpret(
@ -56,29 +56,46 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
} = diagnostics,
_url
) do
get_result(PlausibleWeb.Tracker.fallback_installation_type(), diagnostics)
success(PlausibleWeb.Tracker.fallback_installation_type(), diagnostics)
end
def interpret(%__MODULE__{service_error: %{code: code}}, _url)
when code in [:domain_not_found, :invalid_url] do
failure(:customer_website_issue)
end
def interpret(%__MODULE__{service_error: %{code: code}}, _url)
when code in [:bad_browserless_response, :browserless_timeout] do
failure(:browserless_issue)
end
def interpret(
%__MODULE__{
service_error: service_error
},
%__MODULE__{service_error: %{code: :browserless_client_error, extra: extra}},
_url
)
when service_error in [:domain_not_found, :invalid_url],
do: %Result{ok?: false, data: nil, errors: [Atom.to_string(service_error)]}
) do
cond do
String.contains?(extra, "net::") ->
failure(:customer_website_issue)
def interpret(%__MODULE__{} = _diagnostics, _url), do: unhandled_case()
String.contains?(String.downcase(extra), "execution context") ->
failure(:customer_website_issue)
defp unhandled_case() do
true ->
failure(:unknown_issue)
end
end
def interpret(%__MODULE__{} = _diagnostics, _url), do: failure(:unknown_issue)
defp failure(reason) do
%Result{
ok?: false,
data: %{unhandled: true},
errors: ["Unhandled detection case"]
data: %{failure: reason},
errors: [reason]
}
end
defp get_result(suggested_technology, diagnostics) do
defp success(suggested_technology, diagnostics) do
%Result{
ok?: true,
data: %{

View File

@ -185,15 +185,15 @@ defmodule Plausible.PromEx.Plugins.PlausibleMetrics do
on_ee(
do:
counter(
metric_prefix ++ [:detection, :handled],
event_name: InstallationSupport.Detection.Checks.telemetry_event_handled()
metric_prefix ++ [:detection, :success],
event_name: InstallationSupport.Detection.Checks.telemetry_event_success()
)
),
on_ee(
do:
counter(
metric_prefix ++ [:detection, :unhandled],
event_name: InstallationSupport.Detection.Checks.telemetry_event_unhandled()
metric_prefix ++ [:detection, :failure],
event_name: InstallationSupport.Detection.Checks.telemetry_event_failure()
)
),
on_ee(

View File

@ -10,14 +10,17 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
alias Plausible.InstallationSupport.Result
use Plausible.Test.Support.DNS
import Plug.Conn
import ExUnit.CaptureLog
@moduletag :capture_log
@expected_domain "example.com"
@working_url "https://#{@expected_domain}"
describe "running detection" do
test "handles wordpress detection, retrying on 429" do
expected_domain = "example.com"
url_to_verify = nil
test = self()
stub_lookup_a_records(expected_domain)
stub_lookup_a_records(@expected_domain)
get_context_from_body = fn conn ->
{:ok, body, _conn} = Plug.Conn.read_body(conn)
@ -25,7 +28,7 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
end
detection_counter =
stub_detection_result(fn conn, request_i ->
stub_detection_result_with_counter(fn conn, request_i ->
send(test, get_context_from_body.(conn))
case request_i do
@ -60,7 +63,7 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
suggested_technology: "wordpress"
}
} ==
Checks.run(url_to_verify, expected_domain,
Checks.run(url_to_verify, @expected_domain,
report_to: nil,
async?: false,
slowdown: 0
@ -76,7 +79,7 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
"url" =>
^any(
:string,
~r/https:\/\/#{expected_domain}\?plausible_verification=\d+$/
~r/https:\/\/#{@expected_domain}\?plausible_verification=\d+$/
),
"userAgent" =>
"Plausible Verification Agent - if abused, contact support@plausible.io"
@ -86,7 +89,192 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
end
end
defp stub_detection_result(handler) do
describe "observability" do
setup %{test: test, test_pid: test_pid} do
:telemetry.attach_many(
"#{test}-telemetry-handler",
[
Checks.telemetry_event_success(),
Checks.telemetry_event_failure()
],
fn event, %{}, _, _ ->
send(test_pid, {:telemetry_event, event})
end,
%{}
)
end
test "successful detection -> no logs, telemetry :success" do
stub_lookup_a_records(@expected_domain)
detection_stub =
json_response_detection_stub(%{
"completed" => true,
"v1Detected" => nil,
"gtmLikely" => false,
"npm" => false,
"wordpressLikely" => true,
"wordpressPlugin" => false
})
state = run_checks(detection_stub)
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
assert log == ""
assert_receive {:telemetry_event, telemetry_event}
assert telemetry_event == Checks.telemetry_event_success()
end
test "domain not found -> customer website issue" do
stub_lookup_a_records(@expected_domain, [])
detection_counter =
Req.Test.stub(Plausible.InstallationSupport.Checks.Detection, fn _conn ->
raise "This check should've been skipped"
end)
state =
Checks.run(@working_url, @expected_domain,
report_to: nil,
async?: false,
slowdown: 0
)
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
assert detection_counter
assert log =~ "[DETECTION] Failed due to an issue with the customer website"
assert log =~ "service_error: %{code: :domain_not_found}"
assert_receive {:telemetry_event, telemetry_event}
assert telemetry_event == Checks.telemetry_event_failure()
end
for msg <- ["Execution context destroyed", "net::ERR_CONNECTION_REFUSED"] do
test "failure due to a known :browserless_client_error (#{msg}) -> customer website issue" do
stub_lookup_a_records(@expected_domain)
detection_stub =
json_response_detection_stub(%{
"completed" => false,
"error" => %{"message" => unquote(msg)}
})
state = run_checks(detection_stub)
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
assert log =~ "[DETECTION] Failed due to an issue with the customer website"
assert log =~ "code: :browserless_client_error"
assert log =~ ~s|extra: "#{unquote(msg)}"|
assert_receive {:telemetry_event, telemetry_event}
assert telemetry_event == Checks.telemetry_event_failure()
end
end
test "failure due to an unknown :browserless_client_error -> unknown failure" do
stub_lookup_a_records(@expected_domain)
detection_stub =
json_response_detection_stub(%{
"completed" => false,
"error" => %{"message" => "something unexpected"}
})
state = run_checks(detection_stub)
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
assert log =~ "[DETECTION] Unknown failure"
assert log =~ "code: :browserless_client_error"
assert log =~ ~s|extra: "something unexpected"|
assert_receive {:telemetry_event, telemetry_event}
assert telemetry_event == Checks.telemetry_event_failure()
end
test "failure hitting the catch-all interpret clause -> unknown failure" do
stub_lookup_a_records(@expected_domain)
detection_stub = fn conn ->
Req.Test.transport_error(conn, :econnrefused)
end
state = run_checks(detection_stub)
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
assert log =~ "[DETECTION] Unknown failure"
assert log =~ "code: :req_error"
assert log =~ ~s|extra: :econnrefused|
assert_receive {:telemetry_event, telemetry_event}
assert telemetry_event == Checks.telemetry_event_failure()
end
test "failure due to a flaky browserless issue -> browserless issue" do
stub_lookup_a_records(@expected_domain)
detection_stub = fn conn ->
conn
|> put_resp_content_type("text/html")
|> send_resp(400, "some error message")
end
state = run_checks(detection_stub)
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
assert log =~ "[DETECTION] Failed due to a Browserless issue"
assert log =~ "code: :bad_browserless_response"
assert log =~ "extra: 400"
assert_receive {:telemetry_event, telemetry_event}
assert telemetry_event == Checks.telemetry_event_failure()
end
test "failure due to a browserless timeout -> browserless issue" do
stub_lookup_a_records(@expected_domain)
detection_stub = fn conn ->
Req.Test.transport_error(conn, :timeout)
end
state = run_checks(detection_stub)
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
assert log =~ "[DETECTION] Failed due to a Browserless issue"
assert log =~ "code: :browserless_timeout"
assert_receive {:telemetry_event, telemetry_event}
assert telemetry_event == Checks.telemetry_event_failure()
end
end
defp json_response_detection_stub(js_data) do
fn conn ->
conn
|> put_resp_content_type("application/json")
|> send_resp(200, Jason.encode!(%{"data" => js_data}))
end
end
defp run_checks(detection_stub) do
Req.Test.stub(Plausible.InstallationSupport.Checks.Detection, detection_stub)
Checks.run(@working_url, @expected_domain,
report_to: nil,
async?: false,
slowdown: 0
)
end
defp stub_detection_result_with_counter(handler) do
counter = :atomics.new(1, [])
Req.Test.stub(Plausible.InstallationSupport.Checks.Detection, fn conn ->