reports: add ability to group reports by column.

Add ability to group reports by various columns. For example, you can see
the posts by the top 10 uploaders over time, or posts grouped by rating
over time.
This commit is contained in:
evazion
2022-10-22 03:17:30 -05:00
parent d1b6f30517
commit f73d2e3956
9 changed files with 176 additions and 74 deletions

View File

@@ -58,6 +58,7 @@ gem "debug"
gem "ffaker" gem "ffaker"
gem "composite_primary_keys" gem "composite_primary_keys"
gem "resolv" gem "resolv"
gem "rover-df"
group :development do group :development do
gem 'rubocop', require: false gem 'rubocop', require: false

View File

@@ -315,6 +315,7 @@ GEM
nokogiri (1.13.9) nokogiri (1.13.9)
mini_portile2 (~> 2.8.0) mini_portile2 (~> 2.8.0)
racc (~> 1.4) racc (~> 1.4)
numo-narray (0.9.2.1)
oauth2 (2.0.9) oauth2 (2.0.9)
faraday (>= 0.17.3, < 3.0) faraday (>= 0.17.3, < 3.0)
jwt (>= 1.0, < 3.0) jwt (>= 1.0, < 3.0)
@@ -407,6 +408,8 @@ GEM
reverse_markdown (2.1.1) reverse_markdown (2.1.1)
nokogiri nokogiri
rexml (3.2.5) rexml (3.2.5)
rover-df (0.3.4)
numo-narray (>= 0.9.1.9)
rubocop (1.36.0) rubocop (1.36.0)
json (~> 2.3) json (~> 2.3)
parallel (~> 1.10) parallel (~> 1.10)
@@ -582,6 +585,7 @@ DEPENDENCIES
redis redis
resolv resolv
responders responders
rover-df
rubocop rubocop
rubocop-rails rubocop-rails
ruby-vips ruby-vips

View File

@@ -3,11 +3,11 @@
class TimeSeriesComponent < ApplicationComponent class TimeSeriesComponent < ApplicationComponent
delegate :current_page_path, :search_params, to: :helpers delegate :current_page_path, :search_params, to: :helpers
attr_reader :results, :columns, :mode attr_reader :dataframe, :group, :mode
def initialize(results, columns, mode: :table) def initialize(dataframe, group: nil, mode: :table)
@results = results @dataframe = dataframe
@columns = columns @group = group
@mode = mode.to_sym @mode = mode.to_sym
end end
end end

View File

@@ -9,20 +9,18 @@
<% if mode == :table %> <% if mode == :table %>
<table class="striped autofit" width="100%"> <table class="striped autofit" width="100%">
<thead> <thead>
<th>Date</th> <% dataframe.types.keys.each do |column| %>
<%= tag.th(column.to_s.titleize, class: ("col-expand" if column == dataframe.types.keys.last)) %>
<% columns.each do |column| %>
<%= tag.th(column.to_s.capitalize, class: ("col-expand" if column == columns.last)) %>
<% end %> <% end %>
</thead> </thead>
<tbody> <tbody>
<% results.each do |row| %> <% dataframe.each_row do |row| %>
<tr> <tr>
<td><%= row["date"].to_date %></td> <% dataframe.types.keys.each do |column| %>
<td>
<% columns.each do |column| %> <%= row[column] %>
<td><%= row[column.to_s] %></td> </td>
<% end %> <% end %>
<tr> <tr>
<% end %> <% end %>
@@ -32,8 +30,8 @@
<div class="line-chart" style="width: 100%; height: 80vh;"></div> <div class="line-chart" style="width: 100%; height: 80vh;"></div>
<script type="text/javascript"> <script type="text/javascript">
var data = <%= raw results.to_a.to_json %>; var data = <%= raw dataframe.each_row.map(&:values).to_json %>;
var columns = <%= raw columns.to_json %>; var columns = <%= raw dataframe.types.keys.without("date").to_json %>;
var chart = new Danbooru.TimeSeriesComponent({ var chart = new Danbooru.TimeSeriesComponent({
container: $(".line-chart").get(0), container: $(".line-chart").get(0),
data: data, data: data,

View File

@@ -11,72 +11,88 @@ class ReportsController < ApplicationController
def show def show
@report = params[:id] @report = params[:id]
@mode = params.dig(:search, :mode) || "chart" @mode = params.dig(:search, :mode) || "chart"
@period = params.dig(:search, :period)&.downcase || "day"
@from = params.dig(:search, :from) || 1.month.ago
@to = params.dig(:search, :to) || Time.zone.now
@columns = params.dig(:search, :columns).to_s.split(/[[:space:],]/).map(&:to_sym)
@group = params.dig(:search, :group)&.downcase&.tr(" ", "_")
@group_limit = params.dig(:search, :group_limit) || 10
case @report case @report
when "posts" when "posts"
@model = Post @model = Post
@title = "Posts Report" @title = "Posts Report"
@columns = { posts: "COUNT(*)", uploaders: "COUNT(distinct uploader_id)" } @available_columns = { posts: "COUNT(*)", uploaders: "COUNT(distinct uploader_id)" }
@available_groups = %w[uploader approver rating is_deleted]
when "post_votes" when "post_votes"
@model = PostVote @model = PostVote
@title = "Post Votes Report" @title = "Post Votes Report"
@columns = { votes: "COUNT(*)", posts: "COUNT(distinct post_id)", voters: "COUNT(distinct user_id)" } @available_columns = { votes: "COUNT(*)", posts: "COUNT(distinct post_id)", voters: "COUNT(distinct user_id)" }
@available_groups = %w[]
when "pools" when "pools"
@model = Pool @model = Pool
@title = "Pools Report" @title = "Pools Report"
@columns = { series_pools: "COUNT(*) FILTER (WHERE category = 'series')", collection_pools: "COUNT(*) FILTER (WHERE category = 'collection')" } @available_columns = { pools: "COUNT(*)" }
@available_groups = %w[category is_deleted]
when "comments" when "comments"
@model = Comment @model = Comment
@title = "Comments Report" @title = "Comments Report"
@columns = { comments: "COUNT(*)", commenters: "COUNT(distinct creator_id)" } @available_columns = { comments: "COUNT(*)", commenters: "COUNT(distinct creator_id)" }
@available_groups = %w[creator do_not_bump_post is_deleted is_sticky]
when "comment_votes" when "comment_votes"
@model = CommentVote @model = CommentVote
@title = "Comment Votes Report" @title = "Comment Votes Report"
@columns = { votes: "COUNT(*)", comments: "COUNT(distinct comment_id)", voters: "COUNT(distinct user_id)" } @available_columns = { votes: "COUNT(*)", comments: "COUNT(distinct comment_id)", voters: "COUNT(distinct user_id)" }
@available_groups = %w[]
when "forum_posts" when "forum_posts"
@model = ForumPost @model = ForumPost
@title = "Forum Posts Report" @title = "Forum Posts Report"
@columns = { forum_posts: "COUNT(*)", posters: "COUNT(distinct creator_id)" } @available_columns = { forum_posts: "COUNT(*)", posters: "COUNT(distinct creator_id)" }
@available_groups = %w[creator is_deleted]
when "bulk_update_requests" when "bulk_update_requests"
@model = BulkUpdateRequest @model = BulkUpdateRequest
@title = "Bulk Update Requests Report" @title = "Bulk Update Requests Report"
@columns = { requests: "COUNT(*)", requestors: "COUNT(distinct user_id)" } @available_columns = { requests: "COUNT(*)", requestors: "COUNT(distinct user_id)" }
@available_groups = %w[user approver status]
when "tag_aliases" when "tag_aliases"
@model = TagAlias @model = TagAlias
@title = "Tag Aliases Report" @title = "Tag Aliases Report"
@columns = { aliases: "COUNT(*)" } @available_columns = { aliases: "COUNT(*)" }
@available_groups = %w[status approver]
when "tag_implications" when "tag_implications"
@model = TagImplication @model = TagImplication
@title = "Tag Implications Report" @title = "Tag Implications Report"
@columns = { aliases: "COUNT(*)" } @available_columns = { aliases: "COUNT(*)" }
@available_groups = %w[status approver]
when "artist_versions" when "artist_versions"
@model = ArtistVersion @model = ArtistVersion
@title = "Artist Edits Report" @title = "Artist Edits Report"
@columns = { artist_edits: "COUNT(*)", artists: "COUNT(distinct artist_id)", editors: "COUNT(distinct updater_id)" } @available_columns = { artist_edits: "COUNT(*)", artists: "COUNT(distinct artist_id)", editors: "COUNT(distinct updater_id)" }
@available_groups = %w[updater]
when "note_versions" when "note_versions"
@model = NoteVersion @model = NoteVersion
@title = "Note Edits Report" @title = "Note Edits Report"
@columns = { note_edits: "COUNT(*)", posts: "COUNT(distinct post_id)", editors: "COUNT(distinct updater_id)" } @available_columns = { note_edits: "COUNT(*)", posts: "COUNT(distinct post_id)", editors: "COUNT(distinct updater_id)" }
@available_groups = %w[updater]
when "wiki_page_versions" when "wiki_page_versions"
@model = WikiPageVersion @model = WikiPageVersion
@title = "Wiki Edits Report" @title = "Wiki Edits Report"
@columns = { wiki_edits: "COUNT(*)", editors: "COUNT(distinct updater_id)" } @available_columns = { wiki_edits: "COUNT(*)", editors: "COUNT(distinct updater_id)" }
@available_groups = %w[updater]
when "users" when "users"
@model = User @model = User
@title = "New Users Report" @title = "New Users Report"
@columns = { users: "COUNT(*)" } @available_columns = { users: "COUNT(*)" }
@available_groups = %w[level]
when "bans" when "bans"
@model = Ban @model = Ban
@title = "Bans Report" @title = "Bans Report"
@columns = { bans: "COUNT(*)", banners: "COUNT(DISTINCT banner_id)" } @available_columns = { bans: "COUNT(*)", banners: "COUNT(DISTINCT banner_id)" }
@available_groups = %w[banner duration]
else else
raise ActiveRecord::RecordNotFound raise ActiveRecord::RecordNotFound
end end
@period = params.dig(:search, :period)&.downcase || "day"
@from = params.dig(:search, :from) || 1.month.ago
@to = params.dig(:search, :to) || Time.zone.now
if CurrentUser.user.is_member? && CurrentUser.user.statement_timeout < 10_000 if CurrentUser.user.is_member? && CurrentUser.user.statement_timeout < 10_000
@statement_timeout = 10_000 @statement_timeout = 10_000
else else
@@ -84,9 +100,15 @@ class ReportsController < ApplicationController
end end
ApplicationRecord.set_timeout(@statement_timeout) do ApplicationRecord.set_timeout(@statement_timeout) do
@results = @model.search(params[:search], CurrentUser.user).timeseries(period: @period, from: @from, to: @to, columns: @columns) @group = nil unless @group&.in?(@available_groups)
@columns = @available_columns.slice(*@columns)
@columns = [@available_columns.first].to_h if @columns.blank?
@dataframe = @model.search(params[:search], CurrentUser.user).timeseries(period: @period, from: @from, to: @to, groups: [@group].compact_blank, group_limit: @group_limit, columns: @columns)
@dataframe["date"] = @dataframe["date"].map(&:to_date)
@dataframe[@group] = @dataframe[@group].map(&:pretty_name) if @group.in?(%w[creator updater uploader banner approver user])
@dataframe = @dataframe.crosstab("date", @group) if @group
end end
respond_with(@results) respond_with(@dataframe)
end end
end end

View File

@@ -11,7 +11,7 @@ export default class TimeSeriesComponent {
this.options = { this.options = {
dataset: { dataset: {
dimensions: ["date", ...columns], dimensions: ["date", ...this.columns],
source: data, source: data,
}, },
tooltip: { tooltip: {
@@ -46,14 +46,15 @@ export default class TimeSeriesComponent {
containLabel: true containLabel: true
}, },
legend: { legend: {
data: columns.map(startCase), data: this.columns.map(startCase),
}, },
xAxis: { type: "time" }, xAxis: { type: "time" },
yAxis: columns.map(name => ({ type: "value" })), yAxis: this.columns.map(name => ({ type: "value" })),
series: columns.map(name => ({ series: this.columns.map(name => ({
name: startCase(name), name: startCase(name),
type: "line", type: "line",
areaStyle: {}, areaStyle: {},
stack: "all",
emphasis: { emphasis: {
focus: "series" focus: "series"
}, },

View File

@@ -3,60 +3,88 @@
module Aggregatable module Aggregatable
extend ActiveSupport::Concern extend ActiveSupport::Concern
def timeseries(period: "day", date_column: :created_at, from: first[date_column], to: Time.now.utc, columns: { count: "COUNT(*)" }) def timeseries(period: "day", date_column: :created_at, from: first[date_column], to: Time.now.utc, groups: [], group_limit: 10, columns: { count: "COUNT(*)" })
raise ArgumentError, "invalid period: #{period}" if !period.in?(%w[second minute hour day week month quarter year]) raise ArgumentError, "invalid period: #{period}" if !period.in?(%w[second minute hour day week month quarter year])
raise ArgumentError if all.group_values.present?
from = from.to_date from = from.to_date
to = to.to_date to = to.to_date
# SELECT group_associations = groups.map { |name| reflections[name.to_s] }.compact_blank
# date_trunc('day', posts.created_at) AS date group_fields = groups.map { |name| reflections[name.to_s]&.foreign_key || name }
# COUNT(*) AS count
# FROM posts # SELECT date_trunc('day', posts.created_at) AS date FROM posts WHERE created_at BETWEEN from AND to GROUP BY date
# WHERE posts.created_at BETWEEN from AND to
# GROUP BY date
subquery = select(date_trunc(period, date_column).as("date")).where(date_column => (from..to)).group("date").reorder(nil) subquery = select(date_trunc(period, date_column).as("date")).where(date_column => (from..to)).group("date").reorder(nil)
group_fields.each do |name|
# SELECT date_trunc('day', posts.created_at) AS date, uploader_id FROM posts WHERE created_at BETWEEN from AND to GROUP BY date, uploader_id
subquery = subquery.select(name).group(name)
end
columns.each do |name, sql| columns.each do |name, sql|
# SELECT COUNT(*) AS count # SELECT date_trunc('day', posts.created_at) AS date, uploader_id, COUNT(*) AS count FROM posts WHERE created_at BETWEEN from AND to GROUP BY date, uploader_id
subquery = subquery.select(Arel.sql(sql).as(name.to_s).to_sql) subquery = subquery.select(Arel.sql(sql).as(name.to_s).to_sql)
end end
# SELECT date_trunc('day', dates) AS date FROM generate_series('2022-01-01', '2022-02-15', '1 day'::interval) AS dates # SELECT date_trunc('day', dates) AS date FROM generate_series(from, to, '1 day'::interval) AS dates
dates = "SELECT #{date_trunc(period, Arel.sql("dates")).to_sql} AS date FROM #{generate_timeseries(from, to, period).to_sql} AS dates" dates = "SELECT #{date_trunc(period, Arel.sql("dates")).to_sql} AS date FROM #{generate_timeseries(from, to, period).to_sql} AS dates"
# SELECT # SELECT dates.date FROM (SELECT date_trunc('day', dates) AS date FROM generate_series(from, to, '1 day'::interval) AS dates) AS dates
# date_trunc('day', dates.date) AS date, query = unscoped.select("dates.date").from("(#{dates}) AS dates")
# COALESCE(subquery.count, 0) AS count
# FROM ( group_fields.each do |field|
# SELECT date_trunc('day', dates) AS date # CROSS JOIN (SELECT uploader_id FROM posts WHERE created_at BETWEEN from AND to AND uploader_id IS NOT NULL GROUP BY uploader_id ORDER BY COUNT(*) DESC LIMIT 10) AS uploader_ids.uploader_id
# FROM generate_series(from, to, '1 day'::interval) AS dates join = select(field).where(date_column => (from..to)).where.not(field => nil).group(field).reorder(Arel.sql("COUNT(*) DESC")).limit(group_limit)
# ) AS dates
# LEFT OUTER JOIN ( # SELECT dates.date, uploader_ids.uploader_id
# SELECT # FROM (SELECT date_trunc('day', dates) AS date FROM generate_series('2022-01-01', '2022-02-15', '1 day'::interval) AS dates) AS dates
# date_trunc('day', posts.created_at) AS date, # CROSS JOIN (SELECT uploader_id FROM posts WHERE created_at BETWEEN from AND to GROUP BY uploader_ids ORDER BY COUNT(*) DESC LIMIT 10) AS uploader_ids.uploader_id
# COUNT(*) AS count query = query.select("#{connection.quote_table_name(field.to_s.pluralize)}.#{connection.quote_column_name(field)}")
# FROM posts query = query.joins("CROSS JOIN (#{join.to_sql}) AS #{connection.quote_column_name(field.to_s.pluralize)}")
# WHERE posts.created_at BETWEEN from AND to end
# GROUP BY date
# ) AS subquery # on_clause = "subquery.date = dates.date AND subquery.uploader_id = uploader_ids.uploader_id"
# ORDER BY date DESC on_clause = ["date", *group_fields].map { |group| "subquery.#{connection.quote_column_name(group)} = #{connection.quote_table_name(group.to_s.pluralize)}.#{connection.quote_column_name(group)}" }.join(" AND ")
query = query = query.joins("LEFT OUTER JOIN (#{subquery.to_sql}) AS subquery ON #{on_clause}")
unscoped. query = query.reorder("date DESC")
select(date_trunc(period, Arel.sql("dates.date")).as("date")).
from("(#{dates}) AS dates").
joins("LEFT OUTER JOIN (#{subquery.to_sql}) AS subquery ON subquery.date = dates.date").
order("date DESC")
columns.each do |name, sql| columns.each do |name, sql|
# SELECT COALESCE(subquery.count, 0) AS count # SELECT dates.date, uploader_ids.uploader_id, COALESCE(subquery.count, 0) AS count FROM ...
query = query.select(coalesce(Arel.sql("subquery.#{connection.quote_column_name(name)}"), 0).as(name.to_s)) query = query.select(coalesce(Arel.sql("subquery.#{connection.quote_column_name(name)}"), 0).as(name.to_s))
end end
query.select_all # query =
# SELECT
# dates.date,
# uploader_ids.uploader_id,
# COALESCE(subquery.count, 0) AS count
# FROM (
# SELECT date_trunc('day', dates) AS date FROM generate_series(from, to, '1 day'::interval) AS dates
# ) AS dates
# CROSS JOIN (
# SELECT uploader_id FROM posts WHERE created_at BETWEEN from AND to AND uploader_id IS NOT NULL GROUP BY uploader_id ORDER BY COUNT(*) DESC LIMIT 10
# ) AS uploader_ids.uploader_id
# LEFT OUTER JOIN (
# SELECT
# date_trunc('day', posts.created_at) AS date,
# uploader_id,
# COUNT(*) AS count
# FROM posts
# WHERE created_at BETWEEN from AND to
# GROUP BY date, uploader_id
# ) subquery ON subquery.date = dates.date AND subquery.uploader_id = uploader_ids.uploader_id
# ORDER BY date DESC
results = query.select_all
types = results.columns.map { |column| [column, :object] }.to_h
dataframe = Danbooru::DataFrame.new(results.to_a, types: types)
dataframe = dataframe.preload_associations(group_associations)
dataframe
end end
def group_by_period(period = "day", column = :created_at) def group_by_period(period = "day", column = :created_at)
select(date_trunc(period, column).as("date")).group("date").order(Arel.sql("date DESC")) select(date_trunc(period, column).as("date")).group("date").reorder(Arel.sql("date DESC"))
end end
def select_all def select_all

View File

@@ -0,0 +1,47 @@
# A wrapper around Rover::DataFrame that adds some extra utility methods.
#
# @see https://github.com/ankane/rover
module Danbooru
class DataFrame
attr_reader :df
delegate :head, :shape, :types, :rename, :each_row, :[], :[]=, to: :df
def initialize(...)
@df = Rover::DataFrame.new(...)
end
# Replace ID columns with the actual object. For example, replace the `user_id` column with a `user` column containing User objects.
def preload_associations(associations)
associations.reduce(dup) do |table, association|
primary_key = association.association_primary_key
foreign_key = association.foreign_key
name = association.name.to_s
ids = table[foreign_key].to_a.uniq.compact_blank
records = association.klass.where(primary_key => ids).index_by(&primary_key.to_sym)
table.rename({ foreign_key => name })
table[name] = table[name].map { |id| records[id] }
table
end
end
def crosstab(index, pivot)
new_df = DataFrame.new(index => df[index].uniq)
df[pivot].uniq.to_a.each do |value|
columns = df.types.keys.without(index, pivot)
columns.each do |column|
name = columns.one? ? value.to_s : "#{value}_#{column}"
new_df[name] = df[df[pivot] == value][column]
end
end
new_df
end
def as_json(*options)
df.to_a
end
end
end

View File

@@ -13,10 +13,11 @@
<%= f.input :from, as: :date, html5: true, input_html: { value: params[:search][:from] || 1.month.ago.to_date } %> <%= f.input :from, as: :date, html5: true, input_html: { value: params[:search][:from] || 1.month.ago.to_date } %>
<%= f.input :to, as: :date, html5: true, input_html: { value: params[:search][:to] || Time.zone.now.to_date } %> <%= f.input :to, as: :date, html5: true, input_html: { value: params[:search][:to] || Time.zone.now.to_date } %>
<%= f.input :period, collection: %w[Day Week Month Year], selected: params[:search][:period] %> <%= f.input :period, collection: %w[Day Week Month Year], selected: params[:search][:period] %>
<%= f.input :group, label: "Group By", collection: @available_groups.map { |group| [group.titleize, group] }, include_blank: true, selected: params[:search][:group] if @available_groups.present? %>
<%= f.input :mode, as: :hidden, input_html: { value: params[:search][:mode] } %> <%= f.input :mode, as: :hidden, input_html: { value: params[:search][:mode] } %>
<%= f.submit "Search" %> <%= f.submit "Search" %>
<% end %> <% end %>
<%= render TimeSeriesComponent.new(@results, @columns.keys, mode: @mode) %> <%= render TimeSeriesComponent.new(@dataframe, mode: @mode) %>
</div> </div>
</div> </div>