Clean up detection sentry events + tests (#5833)
* add module name to service_error when check times out Otherwise, it can sometimes remain unclear in the diagnostics, whether it was InstallationV2 or InstallationV2CacheBust that timed out. * Remove duplicate timeout logic The current production logs show two types of verification timeouts: * service_error: "Unhandled Browserless response status: 408" (vast majority of cases) * service_error: :timeout (only a few cases) The latter happens when we hit the Req receive_timeout (endpoint_timeout + 2s). I've seen Browserless not respect the timeout param from time to time, so it's better to keep the timeout logic "in-house" only. * make service_error into a map with code and extra * interpret temporary service errors ...but still consider them "unhandled" for telemetry, also notifying Sentry and logging the warning. * separate sentry messages (verification) * make Verification.ChecksTest more DRY * organize tests into describe blocks * test verification telemetry and logging * fix codespell * get rid of legacy verification * rename Checks.InstallationV2 -> Checks.VerifyInstallation * delete Live.Installation and rename Live.InstallationV2 -> Live.Installation * rename installationv2 (live) files as well * delete old change-domain routes Also rename current liveview modules and routes, removing the v2 suffix * rename domain_change_v2 files, removing v2 suffix * remove legacy JS verifier code Also fix dockerignore and elixir.yml referencing a wrong priv path * rename verification_v2_test -> verification_test * remove v2 prefix from logs and sentry messages * clean up duplicate external_sites_controller_test.exs tests * remove flag * fix typespec * pass timeout as query param to Browserless too * Fixup external sites controller test module (#5826) * fix test description * clean up detection sentry events + tests * improve naming --------- Co-authored-by: Artur Pata <artur.pata@gmail.com>
This commit is contained in:
parent
a83b4f3583
commit
7540511deb
|
|
@ -37,8 +37,8 @@ defmodule Plausible.InstallationSupport.Detection.Checks do
|
|||
)
|
||||
end
|
||||
|
||||
def telemetry_event_handled(), do: [:plausible, :detection, :handled]
|
||||
def telemetry_event_unhandled(), do: [:plausible, :detection, :unhandled]
|
||||
def telemetry_event_success(), do: [:plausible, :detection, :success]
|
||||
def telemetry_event_failure(), do: [:plausible, :detection, :failure]
|
||||
|
||||
def interpret_diagnostics(%State{
|
||||
diagnostics: diagnostics,
|
||||
|
|
@ -47,24 +47,36 @@ defmodule Plausible.InstallationSupport.Detection.Checks do
|
|||
}) do
|
||||
result = Detection.Diagnostics.interpret(diagnostics, url)
|
||||
|
||||
case result.data do
|
||||
%{unhandled: true} ->
|
||||
Sentry.capture_message("Unhandled case for detection",
|
||||
extra: %{
|
||||
message: inspect(diagnostics),
|
||||
url: url,
|
||||
hash: :erlang.phash2(diagnostics)
|
||||
}
|
||||
)
|
||||
{failed?, trigger_sentry?, msg} =
|
||||
case result do
|
||||
%{ok?: true} ->
|
||||
{false, false, nil}
|
||||
|
||||
Logger.warning(
|
||||
"[DETECTION] Unhandled case (data_domain='#{data_domain}'): #{inspect(diagnostics)}"
|
||||
)
|
||||
%{data: %{failure: :customer_website_issue}} ->
|
||||
{true, false, "Failed due to an issue with the customer website"}
|
||||
|
||||
:telemetry.execute(telemetry_event_unhandled(), %{})
|
||||
%{data: %{failure: :browserless_issue}} ->
|
||||
{true, true, "Failed due to a Browserless issue"}
|
||||
|
||||
_ ->
|
||||
:telemetry.execute(telemetry_event_handled(), %{})
|
||||
_unknown_failure ->
|
||||
{true, true, "Unknown failure"}
|
||||
end
|
||||
|
||||
if failed? do
|
||||
:telemetry.execute(telemetry_event_failure(), %{})
|
||||
Logger.warning("[DETECTION] #{msg} (data_domain='#{data_domain}'): #{inspect(diagnostics)}")
|
||||
else
|
||||
:telemetry.execute(telemetry_event_success(), %{})
|
||||
end
|
||||
|
||||
if trigger_sentry? do
|
||||
Sentry.capture_message("[DETECTION] #{msg}",
|
||||
extra: %{
|
||||
message: inspect(diagnostics),
|
||||
url: url,
|
||||
hash: :erlang.phash2(diagnostics)
|
||||
}
|
||||
)
|
||||
end
|
||||
|
||||
result
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
|
|||
} = diagnostics,
|
||||
_url
|
||||
) do
|
||||
get_result("gtm", diagnostics)
|
||||
success("gtm", diagnostics)
|
||||
end
|
||||
|
||||
def interpret(
|
||||
|
|
@ -34,7 +34,7 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
|
|||
} = diagnostics,
|
||||
_url
|
||||
) do
|
||||
get_result(
|
||||
success(
|
||||
"wordpress",
|
||||
diagnostics
|
||||
)
|
||||
|
|
@ -47,7 +47,7 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
|
|||
} = diagnostics,
|
||||
_url
|
||||
) do
|
||||
get_result("npm", diagnostics)
|
||||
success("npm", diagnostics)
|
||||
end
|
||||
|
||||
def interpret(
|
||||
|
|
@ -56,29 +56,46 @@ defmodule Plausible.InstallationSupport.Detection.Diagnostics do
|
|||
} = diagnostics,
|
||||
_url
|
||||
) do
|
||||
get_result(PlausibleWeb.Tracker.fallback_installation_type(), diagnostics)
|
||||
success(PlausibleWeb.Tracker.fallback_installation_type(), diagnostics)
|
||||
end
|
||||
|
||||
def interpret(%__MODULE__{service_error: %{code: code}}, _url)
|
||||
when code in [:domain_not_found, :invalid_url] do
|
||||
failure(:customer_website_issue)
|
||||
end
|
||||
|
||||
def interpret(%__MODULE__{service_error: %{code: code}}, _url)
|
||||
when code in [:bad_browserless_response, :browserless_timeout] do
|
||||
failure(:browserless_issue)
|
||||
end
|
||||
|
||||
def interpret(
|
||||
%__MODULE__{
|
||||
service_error: service_error
|
||||
},
|
||||
%__MODULE__{service_error: %{code: :browserless_client_error, extra: extra}},
|
||||
_url
|
||||
)
|
||||
when service_error in [:domain_not_found, :invalid_url],
|
||||
do: %Result{ok?: false, data: nil, errors: [Atom.to_string(service_error)]}
|
||||
) do
|
||||
cond do
|
||||
String.contains?(extra, "net::") ->
|
||||
failure(:customer_website_issue)
|
||||
|
||||
def interpret(%__MODULE__{} = _diagnostics, _url), do: unhandled_case()
|
||||
String.contains?(String.downcase(extra), "execution context") ->
|
||||
failure(:customer_website_issue)
|
||||
|
||||
defp unhandled_case() do
|
||||
true ->
|
||||
failure(:unknown_issue)
|
||||
end
|
||||
end
|
||||
|
||||
def interpret(%__MODULE__{} = _diagnostics, _url), do: failure(:unknown_issue)
|
||||
|
||||
defp failure(reason) do
|
||||
%Result{
|
||||
ok?: false,
|
||||
data: %{unhandled: true},
|
||||
errors: ["Unhandled detection case"]
|
||||
data: %{failure: reason},
|
||||
errors: [reason]
|
||||
}
|
||||
end
|
||||
|
||||
defp get_result(suggested_technology, diagnostics) do
|
||||
defp success(suggested_technology, diagnostics) do
|
||||
%Result{
|
||||
ok?: true,
|
||||
data: %{
|
||||
|
|
|
|||
|
|
@ -185,15 +185,15 @@ defmodule Plausible.PromEx.Plugins.PlausibleMetrics do
|
|||
on_ee(
|
||||
do:
|
||||
counter(
|
||||
metric_prefix ++ [:detection, :handled],
|
||||
event_name: InstallationSupport.Detection.Checks.telemetry_event_handled()
|
||||
metric_prefix ++ [:detection, :success],
|
||||
event_name: InstallationSupport.Detection.Checks.telemetry_event_success()
|
||||
)
|
||||
),
|
||||
on_ee(
|
||||
do:
|
||||
counter(
|
||||
metric_prefix ++ [:detection, :unhandled],
|
||||
event_name: InstallationSupport.Detection.Checks.telemetry_event_unhandled()
|
||||
metric_prefix ++ [:detection, :failure],
|
||||
event_name: InstallationSupport.Detection.Checks.telemetry_event_failure()
|
||||
)
|
||||
),
|
||||
on_ee(
|
||||
|
|
|
|||
|
|
@ -10,14 +10,17 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
|
|||
alias Plausible.InstallationSupport.Result
|
||||
use Plausible.Test.Support.DNS
|
||||
import Plug.Conn
|
||||
import ExUnit.CaptureLog
|
||||
@moduletag :capture_log
|
||||
|
||||
@expected_domain "example.com"
|
||||
@working_url "https://#{@expected_domain}"
|
||||
|
||||
describe "running detection" do
|
||||
test "handles wordpress detection, retrying on 429" do
|
||||
expected_domain = "example.com"
|
||||
url_to_verify = nil
|
||||
test = self()
|
||||
stub_lookup_a_records(expected_domain)
|
||||
stub_lookup_a_records(@expected_domain)
|
||||
|
||||
get_context_from_body = fn conn ->
|
||||
{:ok, body, _conn} = Plug.Conn.read_body(conn)
|
||||
|
|
@ -25,7 +28,7 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
|
|||
end
|
||||
|
||||
detection_counter =
|
||||
stub_detection_result(fn conn, request_i ->
|
||||
stub_detection_result_with_counter(fn conn, request_i ->
|
||||
send(test, get_context_from_body.(conn))
|
||||
|
||||
case request_i do
|
||||
|
|
@ -60,7 +63,7 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
|
|||
suggested_technology: "wordpress"
|
||||
}
|
||||
} ==
|
||||
Checks.run(url_to_verify, expected_domain,
|
||||
Checks.run(url_to_verify, @expected_domain,
|
||||
report_to: nil,
|
||||
async?: false,
|
||||
slowdown: 0
|
||||
|
|
@ -76,7 +79,7 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
|
|||
"url" =>
|
||||
^any(
|
||||
:string,
|
||||
~r/https:\/\/#{expected_domain}\?plausible_verification=\d+$/
|
||||
~r/https:\/\/#{@expected_domain}\?plausible_verification=\d+$/
|
||||
),
|
||||
"userAgent" =>
|
||||
"Plausible Verification Agent - if abused, contact support@plausible.io"
|
||||
|
|
@ -86,7 +89,192 @@ defmodule Plausible.InstallationSupport.Detection.ChecksTest do
|
|||
end
|
||||
end
|
||||
|
||||
defp stub_detection_result(handler) do
|
||||
describe "observability" do
|
||||
setup %{test: test, test_pid: test_pid} do
|
||||
:telemetry.attach_many(
|
||||
"#{test}-telemetry-handler",
|
||||
[
|
||||
Checks.telemetry_event_success(),
|
||||
Checks.telemetry_event_failure()
|
||||
],
|
||||
fn event, %{}, _, _ ->
|
||||
send(test_pid, {:telemetry_event, event})
|
||||
end,
|
||||
%{}
|
||||
)
|
||||
end
|
||||
|
||||
test "successful detection -> no logs, telemetry :success" do
|
||||
stub_lookup_a_records(@expected_domain)
|
||||
|
||||
detection_stub =
|
||||
json_response_detection_stub(%{
|
||||
"completed" => true,
|
||||
"v1Detected" => nil,
|
||||
"gtmLikely" => false,
|
||||
"npm" => false,
|
||||
"wordpressLikely" => true,
|
||||
"wordpressPlugin" => false
|
||||
})
|
||||
|
||||
state = run_checks(detection_stub)
|
||||
|
||||
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
|
||||
|
||||
assert log == ""
|
||||
|
||||
assert_receive {:telemetry_event, telemetry_event}
|
||||
assert telemetry_event == Checks.telemetry_event_success()
|
||||
end
|
||||
|
||||
test "domain not found -> customer website issue" do
|
||||
stub_lookup_a_records(@expected_domain, [])
|
||||
|
||||
detection_counter =
|
||||
Req.Test.stub(Plausible.InstallationSupport.Checks.Detection, fn _conn ->
|
||||
raise "This check should've been skipped"
|
||||
end)
|
||||
|
||||
state =
|
||||
Checks.run(@working_url, @expected_domain,
|
||||
report_to: nil,
|
||||
async?: false,
|
||||
slowdown: 0
|
||||
)
|
||||
|
||||
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
|
||||
|
||||
assert detection_counter
|
||||
|
||||
assert log =~ "[DETECTION] Failed due to an issue with the customer website"
|
||||
assert log =~ "service_error: %{code: :domain_not_found}"
|
||||
|
||||
assert_receive {:telemetry_event, telemetry_event}
|
||||
assert telemetry_event == Checks.telemetry_event_failure()
|
||||
end
|
||||
|
||||
for msg <- ["Execution context destroyed", "net::ERR_CONNECTION_REFUSED"] do
|
||||
test "failure due to a known :browserless_client_error (#{msg}) -> customer website issue" do
|
||||
stub_lookup_a_records(@expected_domain)
|
||||
|
||||
detection_stub =
|
||||
json_response_detection_stub(%{
|
||||
"completed" => false,
|
||||
"error" => %{"message" => unquote(msg)}
|
||||
})
|
||||
|
||||
state = run_checks(detection_stub)
|
||||
|
||||
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
|
||||
|
||||
assert log =~ "[DETECTION] Failed due to an issue with the customer website"
|
||||
assert log =~ "code: :browserless_client_error"
|
||||
assert log =~ ~s|extra: "#{unquote(msg)}"|
|
||||
|
||||
assert_receive {:telemetry_event, telemetry_event}
|
||||
assert telemetry_event == Checks.telemetry_event_failure()
|
||||
end
|
||||
end
|
||||
|
||||
test "failure due to an unknown :browserless_client_error -> unknown failure" do
|
||||
stub_lookup_a_records(@expected_domain)
|
||||
|
||||
detection_stub =
|
||||
json_response_detection_stub(%{
|
||||
"completed" => false,
|
||||
"error" => %{"message" => "something unexpected"}
|
||||
})
|
||||
|
||||
state = run_checks(detection_stub)
|
||||
|
||||
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
|
||||
|
||||
assert log =~ "[DETECTION] Unknown failure"
|
||||
assert log =~ "code: :browserless_client_error"
|
||||
assert log =~ ~s|extra: "something unexpected"|
|
||||
|
||||
assert_receive {:telemetry_event, telemetry_event}
|
||||
assert telemetry_event == Checks.telemetry_event_failure()
|
||||
end
|
||||
|
||||
test "failure hitting the catch-all interpret clause -> unknown failure" do
|
||||
stub_lookup_a_records(@expected_domain)
|
||||
|
||||
detection_stub = fn conn ->
|
||||
Req.Test.transport_error(conn, :econnrefused)
|
||||
end
|
||||
|
||||
state = run_checks(detection_stub)
|
||||
|
||||
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
|
||||
|
||||
assert log =~ "[DETECTION] Unknown failure"
|
||||
assert log =~ "code: :req_error"
|
||||
assert log =~ ~s|extra: :econnrefused|
|
||||
|
||||
assert_receive {:telemetry_event, telemetry_event}
|
||||
assert telemetry_event == Checks.telemetry_event_failure()
|
||||
end
|
||||
|
||||
test "failure due to a flaky browserless issue -> browserless issue" do
|
||||
stub_lookup_a_records(@expected_domain)
|
||||
|
||||
detection_stub = fn conn ->
|
||||
conn
|
||||
|> put_resp_content_type("text/html")
|
||||
|> send_resp(400, "some error message")
|
||||
end
|
||||
|
||||
state = run_checks(detection_stub)
|
||||
|
||||
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
|
||||
|
||||
assert log =~ "[DETECTION] Failed due to a Browserless issue"
|
||||
assert log =~ "code: :bad_browserless_response"
|
||||
assert log =~ "extra: 400"
|
||||
|
||||
assert_receive {:telemetry_event, telemetry_event}
|
||||
assert telemetry_event == Checks.telemetry_event_failure()
|
||||
end
|
||||
|
||||
test "failure due to a browserless timeout -> browserless issue" do
|
||||
stub_lookup_a_records(@expected_domain)
|
||||
|
||||
detection_stub = fn conn ->
|
||||
Req.Test.transport_error(conn, :timeout)
|
||||
end
|
||||
|
||||
state = run_checks(detection_stub)
|
||||
|
||||
log = capture_log(fn -> Checks.interpret_diagnostics(state) end)
|
||||
|
||||
assert log =~ "[DETECTION] Failed due to a Browserless issue"
|
||||
assert log =~ "code: :browserless_timeout"
|
||||
|
||||
assert_receive {:telemetry_event, telemetry_event}
|
||||
assert telemetry_event == Checks.telemetry_event_failure()
|
||||
end
|
||||
end
|
||||
|
||||
defp json_response_detection_stub(js_data) do
|
||||
fn conn ->
|
||||
conn
|
||||
|> put_resp_content_type("application/json")
|
||||
|> send_resp(200, Jason.encode!(%{"data" => js_data}))
|
||||
end
|
||||
end
|
||||
|
||||
defp run_checks(detection_stub) do
|
||||
Req.Test.stub(Plausible.InstallationSupport.Checks.Detection, detection_stub)
|
||||
|
||||
Checks.run(@working_url, @expected_domain,
|
||||
report_to: nil,
|
||||
async?: false,
|
||||
slowdown: 0
|
||||
)
|
||||
end
|
||||
|
||||
defp stub_detection_result_with_counter(handler) do
|
||||
counter = :atomics.new(1, [])
|
||||
|
||||
Req.Test.stub(Plausible.InstallationSupport.Checks.Detection, fn conn ->
|
||||
|
|
|
|||
Loading…
Reference in New Issue