defmodule Plausible.Stats.QueryOptimizer do @moduledoc """ Methods to manipulate Query for business logic reasons before building an ecto query. """ use Plausible alias Plausible.Stats.{ DateTimeRange, Filters, Query, TableDecider, TimeOnPage, Util, Time, Legacy } @doc """ This module manipulates an existing query, updating it according to business logic. For example, it: 1. Figures out what the right granularity to group by time is 2. Adds a missing order_by clause to a query 3. Updating "time" dimension in order_by to the right granularity 4. Updates event:hostname filters to also apply on visit level for sane results. 5. Removes revenue metrics from dashboard queries if not requested, present or unavailable for the site. 6. Trims the date range to the current time if query.include.trim_relative_date_range is true. 7. Sets the join_type for the query based on the query. """ def optimize(query) do Enum.reduce(pipeline(), query, fn step, acc -> step.(acc) end) end @doc """ Splits a query into event and sessions subcomponents as not all metrics can be queried from a single table. event:page dimension is treated in a special way, doing a breakdown of visit:entry_page for sessions. """ def split(query) do query.metrics |> Util.maybe_add_visitors_metric() |> TableDecider.partition_metrics(query) |> Enum.map(fn {table_type, metrics} -> build_split_query(table_type, metrics, query) end) end defp pipeline() do [ &update_group_by_time/1, &add_missing_order_by/1, &update_time_in_order_by/1, &extend_hostname_filters_to_visit/1, &remove_revenue_metrics_if_unavailable/1, &set_time_on_page_data/1, &trim_relative_date_range/1, &set_sql_join_type/1 ] end defp add_missing_order_by(%Query{order_by: nil} = query) do order_by = case time_dimension(query) do nil -> [{hd(query.metrics), :desc}] time_dimension -> [{time_dimension, :asc}, {hd(query.metrics), :desc}] end %Query{query | order_by: order_by} end defp add_missing_order_by(query), do: query defp update_group_by_time( %Query{ utc_time_range: %DateTimeRange{first: first, last: last} } = query ) do dimensions = query.dimensions |> Enum.map(fn "time" -> resolve_time_dimension(first, last) entry -> entry end) %Query{query | dimensions: dimensions} end defp update_group_by_time(query), do: query defp resolve_time_dimension(first, last) do cond do DateTime.diff(last, first, :hour) <= 48 -> "time:hour" DateTime.diff(last, first, :day) <= 40 -> "time:day" Plausible.Times.diff(last, first, :week) <= 52 -> "time:week" true -> "time:month" end end defp update_time_in_order_by(%Query{} = query) do order_by = query.order_by |> Enum.map(fn {"time", direction} -> {time_dimension(query), direction} entry -> entry end) %Query{query | order_by: order_by} end @dimensions_hostname_map %{ "visit:source" => "visit:entry_page_hostname", "visit:entry_page" => "visit:entry_page_hostname", "visit:utm_medium" => "visit:entry_page_hostname", "visit:utm_source" => "visit:entry_page_hostname", "visit:utm_campaign" => "visit:entry_page_hostname", "visit:utm_content" => "visit:entry_page_hostname", "visit:utm_term" => "visit:entry_page_hostname", "visit:referrer" => "visit:entry_page_hostname", "visit:exit_page" => "visit:exit_page_hostname" } # To avoid showing referrers across hostnames when event:hostname # filter is present for breakdowns, add entry/exit page hostname # filters defp extend_hostname_filters_to_visit(%Query{} = query) do # Note: Only works since event:hostname is only allowed as a top level filter hostname_filters = query.filters |> Enum.filter(fn [_operation, dimension | _rest] -> dimension == "event:hostname" end) if length(hostname_filters) > 0 do extra_filters = query.dimensions |> Enum.flat_map(&hostname_filters_for_dimension(&1, hostname_filters)) %Query{query | filters: query.filters ++ extra_filters} else query end end defp hostname_filters_for_dimension(dimension, hostname_filters) do if Map.has_key?(@dimensions_hostname_map, dimension) do dimension = Map.get(@dimensions_hostname_map, dimension) hostname_filters |> Enum.map(fn [operation, _dimension | rest] -> [operation, dimension | rest] end) else [] end end defp time_dimension(query) do Enum.find(query.dimensions, &Time.time_dimension?/1) end defp build_split_query(:events, metrics, query) do { :events, Query.set(query, metrics: metrics, include_imported: query.include_imported ) } end defp build_split_query(:sessions, metrics, query) do dimensions = query.dimensions |> Enum.map(fn "event:page" -> "visit:entry_page" dimension -> dimension end) filters = if "event:page" in query.dimensions do Filters.rename_dimensions_used_in_filter(query.filters, %{ "event:page" => "visit:entry_page" }) else query.filters end { :sessions, Query.set(query, filters: filters, metrics: metrics, dimensions: dimensions, include_imported: query.include_imported ) } end defp build_split_query(:sessions_smeared, metrics, query) do {_, query} = build_split_query(:sessions, metrics, query) {:sessions, Query.set(query, smear_session_metrics: true)} end on_ee do defp remove_revenue_metrics_if_unavailable(query) do if query.remove_unavailable_revenue_metrics and map_size(query.revenue_currencies) == 0 do Query.set(query, metrics: query.metrics -- Plausible.Stats.Goal.Revenue.revenue_metrics()) else query end end else defp remove_revenue_metrics_if_unavailable(query), do: query end defp set_time_on_page_data(query) do case {:time_on_page in query.metrics, query.time_on_page_data} do {true, %{new_metric_visible: true, cutoff_date: cutoff_date}} -> cutoff = cutoff_date |> TimeOnPage.cutoff_datetime(query.timezone) |> DateTime.shift_zone!("Etc/UTC") |> DateTime.truncate(:second) Query.set( query, time_on_page_data: Map.merge(query.time_on_page_data, %{ include_new_metric: DateTime.before?(cutoff, query.utc_time_range.last), include_legacy_metric: DateTime.after?(cutoff, query.utc_time_range.first) and Legacy.TimeOnPage.can_merge_legacy_time_on_page?(query), cutoff: if(DateTime.after?(cutoff, query.utc_time_range.first), do: cutoff, else: nil) }) ) _ -> Query.set( query, time_on_page_data: Map.merge(query.time_on_page_data, %{ include_new_metric: false, include_legacy_metric: true, cutoff: nil }) ) end end defp trim_relative_date_range(%Query{include: %{trim_relative_date_range: true}} = query) do # This is here to trim future bucket labels on the main graph if should_trim_date_range?(query) do trimmed_range = trim_date_range_to_now(query) %Query{query | utc_time_range: trimmed_range} else query end end defp trim_relative_date_range(query), do: query defp should_trim_date_range?(%Query{input_date_range: "month"} = query) do today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date() date_range = Query.date_range(query) current_month_start = Date.beginning_of_month(today) current_month_end = Date.end_of_month(today) date_range.first == current_month_start and date_range.last == current_month_end end defp should_trim_date_range?(%Query{input_date_range: "year"} = query) do today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date() date_range = Query.date_range(query) current_year_start = Date.new!(today.year, 1, 1) current_year_end = Date.new!(today.year, 12, 31) date_range.first == current_year_start and date_range.last == current_year_end end defp should_trim_date_range?(%Query{input_date_range: "day"} = query) do today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date() date_range = Query.date_range(query) is_nil(query.include.comparisons) and date_range.first == today and date_range.last == today end defp should_trim_date_range?(_query), do: false defp trim_date_range_to_now(query) do if query.input_date_range == "day" do time_range = query.utc_time_range |> DateTimeRange.to_timezone(query.timezone) current_hour = query.now |> DateTime.shift_zone!(query.timezone) |> Map.merge(%{minute: 59, second: 59, millisecond: 999}) time_range.first |> DateTimeRange.new!(current_hour) |> DateTimeRange.to_timezone("Etc/UTC") else date_range = Query.date_range(query) today = query.now |> DateTime.shift_zone!(query.timezone) |> DateTime.to_date() trimmed_to_date = Enum.min([date_range.last, today], Date) date_range.first |> DateTimeRange.new!(trimmed_to_date, query.timezone) |> DateTimeRange.to_timezone("Etc/UTC") end end # Normally we can always LEFT JOIN as this is more performant and tables # are expected to contain the same dimensions. # The only exception is using the "time:minute"/"time:hour" dimension where the sessions # subquery might return more rows than the events one. That's because we're # counting sessions in all time buckets they were active in even if no event # occurred during that particular bucket. defp set_sql_join_type(query) do if "time:minute" in query.dimensions or "time:hour" in query.dimensions do Query.set(query, sql_join_type: :full) else query end end end