analytics/lib/plausible/stats/sql/where_builder.ex

406 lines
12 KiB
Elixir

defmodule Plausible.Stats.SQL.WhereBuilder do
@moduledoc """
A module for building am ecto where clause of a query out of a query.
"""
import Ecto.Query
import Plausible.Stats.Time, only: [utc_boundaries: 1]
import Plausible.Stats.Filters.Utils, only: [page_regex: 1]
use Plausible.Stats.SQL.Fragments
require Logger
@sessions_only_visit_fields [
:entry_page,
:exit_page,
:entry_page_hostname,
:exit_page_hostname
]
@doc "Builds WHERE clause for a given Query against sessions or events table"
def build(table, query) do
base_condition = filter_site_time_range(table, query)
query.filters
|> Enum.map(&add_filter(table, query, &1))
|> Enum.reduce(base_condition, fn condition, acc -> dynamic([], ^acc and ^condition) end)
end
@doc """
Builds WHERE clause condition based off of a filter and a custom column name
Used for special business logic cases
Accepts nil as the `filter` parameter, in which case the condition is a no-op (WHERE TRUE).
"""
def build_condition(db_field, filter) do
if filter do
filter_field(db_field, filter)
else
true
end
end
defp filter_site_time_range(table, query) do
dynamic([], ^filter_site_id(query) and ^filter_time_range(table, query))
end
defp filter_site_id(query) do
case query.consolidated_site_ids do
nil ->
dynamic([x], x.site_id == ^query.site_id)
[_ | _] = ids ->
dynamic([x], fragment("? in ?", x.site_id, ^ids))
end
end
defp filter_time_range(:events, query) do
{first_datetime, last_datetime} = utc_boundaries(query)
dynamic([e], e.timestamp >= ^first_datetime and e.timestamp <= ^last_datetime)
end
defp filter_time_range(:sessions, query) do
{first_datetime, last_datetime} = utc_boundaries(query)
# Counts each _active_ session in time range even if they started before
dynamic(
[s],
# Currently, the sessions table in ClickHouse only has `start` column
# in its primary key. This means that filtering by `timestamp` is not
# considered when estimating number of returned rows from index
# for sample factor calculation. The redundant lower bound `start` condition
# ensures the lower bound time filter is still present as primary key
# condition and the sample factor estimation has minimal skew.
#
# Without it, the sample factor would be greatly overestimated for large sites,
# as query would be estimated to return _all_ rows matching other conditions
# before `start == last_datetime`.
s.start >= ^NaiveDateTime.add(first_datetime, -7, :day) and
s.timestamp >= ^first_datetime and
s.start <= ^last_datetime
)
end
defp add_filter(table, query, [:ignore_in_totals_query, filter]) do
add_filter(table, query, filter)
end
defp add_filter(table, query, [:not, filter]) do
dynamic([e], not (^add_filter(table, query, filter)))
end
defp add_filter(table, query, [:and, filters]) do
filters
|> Enum.map(&add_filter(table, query, &1))
|> Enum.reduce(fn condition, acc -> dynamic([], ^acc and ^condition) end)
end
defp add_filter(table, query, [:or, filters]) do
filters
|> Enum.map(&add_filter(table, query, &1))
|> Enum.reduce(fn condition, acc -> dynamic([], ^acc or ^condition) end)
end
defp add_filter(_table, query, [:has_done, filter]) do
condition =
dynamic([], ^filter_site_time_range(:events, query) and ^add_filter(:events, query, filter))
dynamic(
[t],
t.session_id in subquery(from(e in "events_v2", where: ^condition, select: e.session_id))
)
end
defp add_filter(table, query, [:has_not_done, filter]) do
dynamic([], not (^add_filter(table, query, [:has_done, filter])))
end
defp add_filter(:events, _query, [:is, "event:name" | _rest] = filter) do
in_clause(col_value(:name), filter)
end
defp add_filter(:events, query, [_, "event:goal" | _rest] = filter) do
Plausible.Stats.Goals.add_filter(query, filter)
end
defp add_filter(:events, _query, [_, "event:page" | _rest] = filter) do
filter_field(:pathname, filter)
end
defp add_filter(:events, _query, [_, "event:hostname" | _rest] = filter) do
filter_field(:hostname, filter)
end
defp add_filter(:events, _query, [_, "event:props:" <> prop_name | _rest] = filter) do
filter_custom_prop(prop_name, :meta, filter)
end
defp add_filter(:events, _query, [_, "visit:entry_props:" <> _prop_name | _rest]) do
true
end
defp add_filter(
:events,
_query,
[_, "visit:" <> key | _rest] = filter
) do
# Filter events query with visit dimension if possible
field_name = db_field_name(key)
if Enum.member?(@sessions_only_visit_fields, field_name) do
true
else
filter_field(field_name, filter)
end
end
defp add_filter(:sessions, _query, [_, "visit:entry_props:" <> prop_name | _rest] = filter) do
filter_custom_prop(prop_name, :entry_meta, filter)
end
defp add_filter(:sessions, _query, [_, "visit:" <> key | _rest] = filter) do
filter_field(db_field_name(key), filter)
end
defp add_filter(:sessions, _query, [_, "event:" <> _ | _rest]) do
# Cannot apply sessions filters directly on session query where clause.
true
end
defp add_filter(table, _query, filter) do
Logger.notice("Unable to process garbage filter. No results are returned",
table: table,
filter: filter
)
false
end
defp filter_custom_prop(prop_name, column_name, [:is, _, clauses | _rest] = filter) do
none_value_included = Enum.member?(clauses, "(none)")
prop_value_expr = custom_prop_value(column_name, prop_name)
dynamic(
[t],
(has_key(t, column_name, ^prop_name) and ^in_clause(prop_value_expr, filter)) or
(^none_value_included and not has_key(t, column_name, ^prop_name))
)
end
defp filter_custom_prop(prop_name, column_name, [:is_not, _, clauses | _rest] = filter) do
none_value_included = Enum.member?(clauses, "(none)")
prop_value_expr = custom_prop_value(column_name, prop_name)
dynamic(
[t],
(has_key(t, column_name, ^prop_name) and
not (^in_clause(prop_value_expr, filter))) or
(^none_value_included and
has_key(t, column_name, ^prop_name) and
not (^in_clause(prop_value_expr, filter))) or
(not (^none_value_included) and not has_key(t, column_name, ^prop_name))
)
end
defp filter_custom_prop(prop_name, column_name, [:matches_wildcard, dimension, clauses | rest]) do
regexes = Enum.map(clauses, &page_regex/1)
filter_custom_prop(prop_name, column_name, [:matches, dimension, regexes | rest])
end
defp filter_custom_prop(prop_name, column_name, [
:matches_wildcard_not,
dimension,
clauses | rest
]) do
regexes = Enum.map(clauses, &page_regex/1)
filter_custom_prop(prop_name, column_name, [:matches_not, dimension, regexes | rest])
end
defp filter_custom_prop(prop_name, column_name, [:matches, _dimension, clauses | _rest]) do
dynamic(
[t],
has_key(t, column_name, ^prop_name) and
fragment(
"arrayExists(k -> match(?, k), ?)",
get_by_key(t, column_name, ^prop_name),
^clauses
)
)
end
defp filter_custom_prop(prop_name, column_name, [:matches_not, _dimension, clauses | _rest]) do
dynamic(
[t],
has_key(t, column_name, ^prop_name) and
fragment(
"not(arrayExists(k -> match(?, k), ?))",
get_by_key(t, column_name, ^prop_name),
^clauses
)
)
end
defp filter_custom_prop(prop_name, column_name, [:contains | _rest] = filter) do
dynamic(
[t],
has_key(t, column_name, ^prop_name) and
^contains_clause(custom_prop_value(column_name, prop_name), filter)
)
end
defp filter_custom_prop(prop_name, column_name, [:contains_not | _] = filter) do
dynamic(
[t],
has_key(t, column_name, ^prop_name) and
not (^contains_clause(custom_prop_value(column_name, prop_name), filter))
)
end
defp filter_field(db_field, [:matches_wildcard, _dimension, glob_exprs | _rest]) do
page_regexes = Enum.map(glob_exprs, &page_regex/1)
dynamic(
[x],
fragment("multiMatchAny(?, ?)", type(field(x, ^db_field), :string), ^page_regexes)
)
end
defp filter_field(db_field, [:matches_wildcard_not | rest]) do
dynamic([], not (^filter_field(db_field, [:matches_wildcard | rest])))
end
defp filter_field(db_field, [:contains | _rest] = filter) do
if no_ref_field?(db_field) do
contains_clause_no_ref(col_value_string(db_field), filter)
else
contains_clause(col_value_string(db_field), filter)
end
end
defp filter_field(db_field, [:contains_not | rest]) do
dynamic([], not (^filter_field(db_field, [:contains | rest])))
end
defp filter_field(db_field, [:matches, _dimension, clauses | _rest]) do
dynamic(
[x],
fragment("multiMatchAny(?, ?)", type(field(x, ^db_field), :string), ^clauses)
)
end
defp filter_field(db_field, [:matches_not | rest]) do
dynamic([], not (^filter_field(db_field, [:matches | rest])))
end
defp filter_field(db_field, [:is, _dimension, clauses | _rest] = filter) do
list = clauses |> Enum.map(&db_field_val(db_field, &1))
in_clause(col_value(db_field), filter, list)
end
defp filter_field(db_field, [:is_not | rest]) do
dynamic([], not (^filter_field(db_field, [:is | rest])))
end
@no_ref "Direct / None"
@not_set "(not set)"
defp db_field_name("channel"), do: :acquisition_channel
defp db_field_name(name), do: String.to_existing_atom(name)
defp no_ref_field?(:source), do: true
defp no_ref_field?(:referrer), do: true
defp no_ref_field?(:utm_medium), do: true
defp no_ref_field?(:utm_source), do: true
defp no_ref_field?(:utm_campaign), do: true
defp no_ref_field?(:utm_content), do: true
defp no_ref_field?(:utm_term), do: true
defp no_ref_field?(_), do: false
defp db_field_val(:source, @no_ref), do: ""
defp db_field_val(:referrer, @no_ref), do: ""
defp db_field_val(:utm_medium, @no_ref), do: ""
defp db_field_val(:utm_source, @no_ref), do: ""
defp db_field_val(:utm_campaign, @no_ref), do: ""
defp db_field_val(:utm_content, @no_ref), do: ""
defp db_field_val(:utm_term, @no_ref), do: ""
defp db_field_val(_, @not_set), do: ""
defp db_field_val(_, val), do: val
defp col_value(column_name) do
dynamic([t], field(t, ^column_name))
end
# Needed for string functions to work properly
defp col_value_string(column_name) do
dynamic([t], type(field(t, ^column_name), :string))
end
defp custom_prop_value(column_name, prop_name) do
dynamic([t], get_by_key(t, column_name, ^prop_name))
end
defp in_clause(value_expression, [_, _, clauses | _] = filter, values \\ nil) do
values = values || clauses
if case_sensitive?(filter) do
dynamic([t], ^value_expression in ^values)
else
values = values |> Enum.map(&String.downcase/1)
dynamic([t], fragment("lower(?)", ^value_expression) in ^values)
end
end
defp contains_clause(value_expression, [_, _, clauses | _] = filter) do
if case_sensitive?(filter) do
dynamic(
[x],
fragment("multiSearchAny(?, ?)", ^value_expression, ^clauses)
)
else
dynamic(
[x],
fragment("multiSearchAnyCaseInsensitive(?, ?)", ^value_expression, ^clauses)
)
end
end
defp contains_clause_no_ref(value_expression, [_, _, clauses | _] = filter) do
case_sensitive? = case_sensitive?(filter)
expression =
if case_sensitive? do
dynamic(
[x],
fragment("multiSearchAny(?, ?)", ^value_expression, ^clauses)
)
else
dynamic(
[x],
fragment("multiSearchAnyCaseInsensitive(?, ?)", ^value_expression, ^clauses)
)
end
if Enum.any?(clauses, &matches_no_ref?(&1, case_sensitive?)) do
dynamic([x], ^expression or fragment("? = ?", ^value_expression, ""))
else
expression
end
end
defp case_sensitive?([_, _, _, %{case_sensitive: false}]), do: false
defp case_sensitive?(_), do: true
@no_ref_downcase String.downcase(@no_ref)
defp matches_no_ref?(input, false) do
String.contains?(@no_ref_downcase, String.downcase(input))
end
defp matches_no_ref?(input, true) do
String.contains?(@no_ref, input)
end
end