analytics/lib/plausible/stats/query_optimizer.ex

333 lines
10 KiB
Elixir

defmodule Plausible.Stats.QueryOptimizer do
@moduledoc """
Methods to manipulate Query for business logic reasons before building an ecto query.
"""
use Plausible
alias Plausible.Stats.{
DateTimeRange,
Filters,
Query,
TableDecider,
TimeOnPage,
Util,
Time,
Legacy
}
@doc """
This module manipulates an existing query, updating it according to business logic.
For example, it:
1. Figures out what the right granularity to group by time is
2. Adds a missing order_by clause to a query
3. Updating "time" dimension in order_by to the right granularity
4. Updates event:hostname filters to also apply on visit level for sane results.
5. Removes revenue metrics from dashboard queries if not requested, present or unavailable for the site.
6. Trims the date range to the current time if query.include.trim_relative_date_range is true.
7. Sets the join_type for the query based on the query.
"""
def optimize(query) do
Enum.reduce(pipeline(), query, fn step, acc -> step.(acc) end)
end
@doc """
Splits a query into event and sessions subcomponents as not all metrics can be
queried from a single table.
event:page dimension is treated in a special way, doing a breakdown of visit:entry_page
for sessions.
"""
def split(query) do
query.metrics
|> Util.maybe_add_visitors_metric()
|> TableDecider.partition_metrics(query)
|> Enum.map(fn {table_type, metrics} ->
build_split_query(table_type, metrics, query)
end)
end
defp pipeline() do
[
&update_group_by_time/1,
&add_missing_order_by/1,
&update_time_in_order_by/1,
&extend_hostname_filters_to_visit/1,
&remove_revenue_metrics_if_unavailable/1,
&set_time_on_page_data/1,
&trim_relative_date_range/1,
&set_sql_join_type/1
]
end
defp add_missing_order_by(%Query{order_by: nil} = query) do
order_by =
case time_dimension(query) do
nil -> [{hd(query.metrics), :desc}]
time_dimension -> [{time_dimension, :asc}, {hd(query.metrics), :desc}]
end
%Query{query | order_by: order_by}
end
defp add_missing_order_by(query), do: query
defp update_group_by_time(
%Query{
utc_time_range: %DateTimeRange{first: first, last: last}
} = query
) do
dimensions =
query.dimensions
|> Enum.map(fn
"time" -> resolve_time_dimension(first, last)
entry -> entry
end)
%Query{query | dimensions: dimensions}
end
defp update_group_by_time(query), do: query
defp resolve_time_dimension(first, last) do
cond do
DateTime.diff(last, first, :hour) <= 48 -> "time:hour"
DateTime.diff(last, first, :day) <= 40 -> "time:day"
Plausible.Times.diff(last, first, :week) <= 52 -> "time:week"
true -> "time:month"
end
end
defp update_time_in_order_by(%Query{} = query) do
order_by =
query.order_by
|> Enum.map(fn
{"time", direction} -> {time_dimension(query), direction}
entry -> entry
end)
%Query{query | order_by: order_by}
end
@dimensions_hostname_map %{
"visit:source" => "visit:entry_page_hostname",
"visit:entry_page" => "visit:entry_page_hostname",
"visit:utm_medium" => "visit:entry_page_hostname",
"visit:utm_source" => "visit:entry_page_hostname",
"visit:utm_campaign" => "visit:entry_page_hostname",
"visit:utm_content" => "visit:entry_page_hostname",
"visit:utm_term" => "visit:entry_page_hostname",
"visit:referrer" => "visit:entry_page_hostname",
"visit:exit_page" => "visit:exit_page_hostname"
}
# To avoid showing referrers across hostnames when event:hostname
# filter is present for breakdowns, add entry/exit page hostname
# filters
defp extend_hostname_filters_to_visit(%Query{} = query) do
# Note: Only works since event:hostname is only allowed as a top level filter
hostname_filters =
query.filters
|> Enum.filter(fn [_operation, dimension | _rest] -> dimension == "event:hostname" end)
if length(hostname_filters) > 0 do
extra_filters =
query.dimensions
|> Enum.flat_map(&hostname_filters_for_dimension(&1, hostname_filters))
%Query{query | filters: query.filters ++ extra_filters}
else
query
end
end
defp hostname_filters_for_dimension(dimension, hostname_filters) do
if Map.has_key?(@dimensions_hostname_map, dimension) do
dimension = Map.get(@dimensions_hostname_map, dimension)
hostname_filters
|> Enum.map(fn [operation, _dimension | rest] -> [operation, dimension | rest] end)
else
[]
end
end
defp time_dimension(query) do
Enum.find(query.dimensions, &Time.time_dimension?/1)
end
defp build_split_query(:events, metrics, query) do
{
:events,
Query.set(query,
metrics: metrics,
include_imported: query.include_imported
)
}
end
defp build_split_query(:sessions, metrics, query) do
dimensions =
query.dimensions
|> Enum.map(fn
"event:page" -> "visit:entry_page"
dimension -> dimension
end)
filters =
if "event:page" in query.dimensions do
Filters.rename_dimensions_used_in_filter(query.filters, %{
"event:page" => "visit:entry_page"
})
else
query.filters
end
{
:sessions,
Query.set(query,
filters: filters,
metrics: metrics,
dimensions: dimensions,
include_imported: query.include_imported
)
}
end
defp build_split_query(:sessions_smeared, metrics, query) do
{_, query} = build_split_query(:sessions, metrics, query)
{:sessions, Query.set(query, smear_session_metrics: true)}
end
on_ee do
defp remove_revenue_metrics_if_unavailable(query) do
if query.remove_unavailable_revenue_metrics and map_size(query.revenue_currencies) == 0 do
Query.set(query, metrics: query.metrics -- Plausible.Stats.Goal.Revenue.revenue_metrics())
else
query
end
end
else
defp remove_revenue_metrics_if_unavailable(query), do: query
end
defp set_time_on_page_data(query) do
case {:time_on_page in query.metrics, query.time_on_page_data} do
{true, %{new_metric_visible: true, cutoff_date: cutoff_date}} ->
cutoff =
cutoff_date
|> TimeOnPage.cutoff_datetime(query.timezone)
|> DateTime.shift_zone!("Etc/UTC")
|> DateTime.truncate(:second)
Query.set(
query,
time_on_page_data:
Map.merge(query.time_on_page_data, %{
include_new_metric: DateTime.before?(cutoff, query.utc_time_range.last),
include_legacy_metric:
DateTime.after?(cutoff, query.utc_time_range.first) and
Legacy.TimeOnPage.can_merge_legacy_time_on_page?(query),
cutoff:
if(DateTime.after?(cutoff, query.utc_time_range.first), do: cutoff, else: nil)
})
)
_ ->
Query.set(
query,
time_on_page_data:
Map.merge(query.time_on_page_data, %{
include_new_metric: false,
include_legacy_metric: true,
cutoff: nil
})
)
end
end
defp trim_relative_date_range(%Query{include: %{trim_relative_date_range: true}} = query) do
# This is here to trim future bucket labels on the main graph
if should_trim_date_range?(query) do
trimmed_range = trim_date_range_to_now(query)
%Query{query | utc_time_range: trimmed_range}
else
query
end
end
defp trim_relative_date_range(query), do: query
defp should_trim_date_range?(%Query{input_date_range: :month} = query) do
today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date()
date_range = Query.date_range(query)
current_month_start = Date.beginning_of_month(today)
current_month_end = Date.end_of_month(today)
date_range.first == current_month_start and date_range.last == current_month_end
end
defp should_trim_date_range?(%Query{input_date_range: :year} = query) do
today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date()
date_range = Query.date_range(query)
current_year_start = Date.new!(today.year, 1, 1)
current_year_end = Date.new!(today.year, 12, 31)
date_range.first == current_year_start and date_range.last == current_year_end
end
defp should_trim_date_range?(%Query{input_date_range: :day} = query) do
today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date()
date_range = Query.date_range(query)
is_nil(query.include.comparisons) and date_range.first == today and date_range.last == today
end
defp should_trim_date_range?(_query), do: false
defp trim_date_range_to_now(query) do
if query.input_date_range == :day do
time_range = query.utc_time_range |> DateTimeRange.to_timezone(query.timezone)
current_hour =
query.now
|> DateTime.shift_zone!(query.timezone)
|> Map.merge(%{minute: 59, second: 59, millisecond: 999})
time_range.first
|> DateTimeRange.new!(current_hour)
|> DateTimeRange.to_timezone("Etc/UTC")
else
date_range = Query.date_range(query)
today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date()
trimmed_to_date =
Enum.min([date_range.last, today], Date)
date_range.first
|> DateTimeRange.new!(trimmed_to_date, query.timezone)
|> DateTimeRange.to_timezone("Etc/UTC")
end
end
# Normally we can always LEFT JOIN as this is more performant and tables
# are expected to contain the same dimensions.
# The only exception is using the "time:minute"/"time:hour" dimension where the sessions
# subquery might return more rows than the events one. That's because we're
# counting sessions in all time buckets they were active in even if no event
# occurred during that particular bucket.
defp set_sql_join_type(query) do
if "time:minute" in query.dimensions or "time:hour" in query.dimensions do
Query.set(query, sql_join_type: :full)
else
query
end
end
end