Export public database dumps to BigQuery.
* Export daily public database dumps to BigQuery and Google Cloud Storage. * Only data visible to anonymous users is exported. Some tables have null or missing fields because of this. * The bans table is excluded because some bans have an expires_at timestamp set beyond year 9999, which BigQuery doesn't support. * The favorites table is excluded because it's too slow to dump (it doesn't have an id index, which is needed by find_each). * Version tables are excluded because dumping them every day is inefficient, streaming insertions should be used instead. Links: * https://console.cloud.google.com/bigquery?project=danbooru1 * https://console.cloud.google.com/storage/browser/danbooru_public * https://storage.googleapis.com/danbooru_public/data/posts.json
This commit is contained in:
1
.github/workflows/test.yaml
vendored
1
.github/workflows/test.yaml
vendored
@@ -49,6 +49,7 @@ jobs:
|
|||||||
DANBOORU_RAKISMET_KEY: ${{ secrets.DANBOORU_RAKISMET_KEY }}
|
DANBOORU_RAKISMET_KEY: ${{ secrets.DANBOORU_RAKISMET_KEY }}
|
||||||
DANBOORU_RAKISMET_URL: ${{ secrets.DANBOORU_RAKISMET_URL }}
|
DANBOORU_RAKISMET_URL: ${{ secrets.DANBOORU_RAKISMET_URL }}
|
||||||
DANBOORU_IP_REGISTRY_API_KEY: ${{ secrets.DANBOORU_IP_REGISTRY_API_KEY }}
|
DANBOORU_IP_REGISTRY_API_KEY: ${{ secrets.DANBOORU_IP_REGISTRY_API_KEY }}
|
||||||
|
DANBOORU_GOOGLE_CLOUD_CREDENTIALS: ${{ secrets.DANBOORU_GOOGLE_CLOUD_CREDENTIALS }}
|
||||||
DANBOORU_STRIPE_SECRET_KEY: ${{ secrets.DANBOORU_STRIPE_SECRET_KEY }}
|
DANBOORU_STRIPE_SECRET_KEY: ${{ secrets.DANBOORU_STRIPE_SECRET_KEY }}
|
||||||
DANBOORU_STRIPE_PUBLISHABLE_KEY: ${{ secrets.DANBOORU_STRIPE_PUBLISHABLE_KEY }}
|
DANBOORU_STRIPE_PUBLISHABLE_KEY: ${{ secrets.DANBOORU_STRIPE_PUBLISHABLE_KEY }}
|
||||||
DANBOORU_STRIPE_WEBHOOK_SECRET: ${{ secrets.DANBOORU_STRIPE_WEBHOOK_SECRET }}
|
DANBOORU_STRIPE_WEBHOOK_SECRET: ${{ secrets.DANBOORU_STRIPE_WEBHOOK_SECRET }}
|
||||||
|
|||||||
2
Gemfile
2
Gemfile
@@ -47,6 +47,8 @@ gem 'nokogiri'
|
|||||||
gem 'view_component', require: 'view_component/engine'
|
gem 'view_component', require: 'view_component/engine'
|
||||||
gem 'tzinfo-data'
|
gem 'tzinfo-data'
|
||||||
gem 'hsluv'
|
gem 'hsluv'
|
||||||
|
gem 'google-cloud-bigquery', require: "google/cloud/bigquery"
|
||||||
|
gem 'google-cloud-storage', require: "google/cloud/storage"
|
||||||
|
|
||||||
group :production, :staging do
|
group :production, :staging do
|
||||||
gem 'unicorn', :platforms => :ruby
|
gem 'unicorn', :platforms => :ruby
|
||||||
|
|||||||
63
Gemfile.lock
63
Gemfile.lock
@@ -161,12 +161,16 @@ GEM
|
|||||||
daemons (1.3.1)
|
daemons (1.3.1)
|
||||||
dante (0.2.0)
|
dante (0.2.0)
|
||||||
debug_inspector (1.0.0)
|
debug_inspector (1.0.0)
|
||||||
|
declarative (0.0.20)
|
||||||
|
declarative-option (0.1.0)
|
||||||
delayed_job (4.1.9)
|
delayed_job (4.1.9)
|
||||||
activesupport (>= 3.0, < 6.2)
|
activesupport (>= 3.0, < 6.2)
|
||||||
delayed_job_active_record (4.1.5)
|
delayed_job_active_record (4.1.5)
|
||||||
activerecord (>= 3.0, < 6.2)
|
activerecord (>= 3.0, < 6.2)
|
||||||
delayed_job (>= 3.0, < 5)
|
delayed_job (>= 3.0, < 5)
|
||||||
diff-lcs (1.4.4)
|
diff-lcs (1.4.4)
|
||||||
|
digest-crc (0.6.3)
|
||||||
|
rake (>= 12.0.0, < 14.0.0)
|
||||||
docile (1.3.5)
|
docile (1.3.5)
|
||||||
domain_name (0.5.20190701)
|
domain_name (0.5.20190701)
|
||||||
unf (>= 0.0.5, < 1.0.0)
|
unf (>= 0.0.5, < 1.0.0)
|
||||||
@@ -192,6 +196,49 @@ GEM
|
|||||||
ffi (~> 1.0)
|
ffi (~> 1.0)
|
||||||
globalid (0.4.2)
|
globalid (0.4.2)
|
||||||
activesupport (>= 4.2.0)
|
activesupport (>= 4.2.0)
|
||||||
|
google-apis-bigquery_v2 (0.6.0)
|
||||||
|
google-apis-core (~> 0.1)
|
||||||
|
google-apis-core (0.3.0)
|
||||||
|
addressable (~> 2.5, >= 2.5.1)
|
||||||
|
googleauth (~> 0.14)
|
||||||
|
httpclient (>= 2.8.1, < 3.0)
|
||||||
|
mini_mime (~> 1.0)
|
||||||
|
representable (~> 3.0)
|
||||||
|
retriable (>= 2.0, < 4.0)
|
||||||
|
rexml
|
||||||
|
signet (~> 0.14)
|
||||||
|
webrick
|
||||||
|
google-apis-iamcredentials_v1 (0.2.0)
|
||||||
|
google-apis-core (~> 0.1)
|
||||||
|
google-apis-storage_v1 (0.3.0)
|
||||||
|
google-apis-core (~> 0.1)
|
||||||
|
google-cloud-bigquery (1.28.0)
|
||||||
|
concurrent-ruby (~> 1.0)
|
||||||
|
google-apis-bigquery_v2 (~> 0.1)
|
||||||
|
google-cloud-core (~> 1.2)
|
||||||
|
googleauth (~> 0.9)
|
||||||
|
mini_mime (~> 1.0)
|
||||||
|
google-cloud-core (1.5.0)
|
||||||
|
google-cloud-env (~> 1.0)
|
||||||
|
google-cloud-errors (~> 1.0)
|
||||||
|
google-cloud-env (1.5.0)
|
||||||
|
faraday (>= 0.17.3, < 2.0)
|
||||||
|
google-cloud-errors (1.0.1)
|
||||||
|
google-cloud-storage (1.30.0)
|
||||||
|
addressable (~> 2.5)
|
||||||
|
digest-crc (~> 0.4)
|
||||||
|
google-apis-iamcredentials_v1 (~> 0.1)
|
||||||
|
google-apis-storage_v1 (~> 0.1)
|
||||||
|
google-cloud-core (~> 1.2)
|
||||||
|
googleauth (~> 0.9)
|
||||||
|
mini_mime (~> 1.0)
|
||||||
|
googleauth (0.16.0)
|
||||||
|
faraday (>= 0.17.3, < 2.0)
|
||||||
|
jwt (>= 1.4, < 3.0)
|
||||||
|
memoist (~> 0.16)
|
||||||
|
multi_json (~> 1.11)
|
||||||
|
os (>= 0.9, < 2.0)
|
||||||
|
signet (~> 0.14)
|
||||||
hsluv (1.0.1)
|
hsluv (1.0.1)
|
||||||
http (4.4.1)
|
http (4.4.1)
|
||||||
addressable (~> 2.3)
|
addressable (~> 2.3)
|
||||||
@@ -201,6 +248,7 @@ GEM
|
|||||||
http-form_data (2.3.0)
|
http-form_data (2.3.0)
|
||||||
http-parser (1.2.3)
|
http-parser (1.2.3)
|
||||||
ffi-compiler (>= 1.0, < 2.0)
|
ffi-compiler (>= 1.0, < 2.0)
|
||||||
|
httpclient (2.8.3)
|
||||||
i18n (1.8.9)
|
i18n (1.8.9)
|
||||||
concurrent-ruby (~> 1.0)
|
concurrent-ruby (~> 1.0)
|
||||||
ipaddress_2 (0.13.0)
|
ipaddress_2 (0.13.0)
|
||||||
@@ -257,6 +305,7 @@ GEM
|
|||||||
multi_json (~> 1.3)
|
multi_json (~> 1.3)
|
||||||
multi_xml (~> 0.5)
|
multi_xml (~> 0.5)
|
||||||
rack (>= 1.2, < 3)
|
rack (>= 1.2, < 3)
|
||||||
|
os (1.1.1)
|
||||||
parallel (1.20.1)
|
parallel (1.20.1)
|
||||||
parser (3.0.0.0)
|
parser (3.0.0.0)
|
||||||
ast (~> 2.4.1)
|
ast (~> 2.4.1)
|
||||||
@@ -321,9 +370,14 @@ GEM
|
|||||||
json
|
json
|
||||||
redis (4.2.5)
|
redis (4.2.5)
|
||||||
regexp_parser (2.1.1)
|
regexp_parser (2.1.1)
|
||||||
|
representable (3.0.4)
|
||||||
|
declarative (< 0.1.0)
|
||||||
|
declarative-option (< 0.2.0)
|
||||||
|
uber (< 0.2.0)
|
||||||
responders (3.0.1)
|
responders (3.0.1)
|
||||||
actionpack (>= 5.0)
|
actionpack (>= 5.0)
|
||||||
railties (>= 5.0)
|
railties (>= 5.0)
|
||||||
|
retriable (3.1.2)
|
||||||
rexml (3.2.4)
|
rexml (3.2.4)
|
||||||
rubocop (1.11.0)
|
rubocop (1.11.0)
|
||||||
parallel (~> 1.10)
|
parallel (~> 1.10)
|
||||||
@@ -359,6 +413,11 @@ GEM
|
|||||||
shoulda-context (2.0.0)
|
shoulda-context (2.0.0)
|
||||||
shoulda-matchers (4.5.1)
|
shoulda-matchers (4.5.1)
|
||||||
activesupport (>= 4.2.0)
|
activesupport (>= 4.2.0)
|
||||||
|
signet (0.15.0)
|
||||||
|
addressable (~> 2.3)
|
||||||
|
faraday (>= 0.17.3, < 2.0)
|
||||||
|
jwt (>= 1.5, < 3.0)
|
||||||
|
multi_json (~> 1.10)
|
||||||
simple_form (5.1.0)
|
simple_form (5.1.0)
|
||||||
actionpack (>= 5.2)
|
actionpack (>= 5.2)
|
||||||
activemodel (>= 5.2)
|
activemodel (>= 5.2)
|
||||||
@@ -391,6 +450,7 @@ GEM
|
|||||||
concurrent-ruby (~> 1.0)
|
concurrent-ruby (~> 1.0)
|
||||||
tzinfo-data (1.2021.1)
|
tzinfo-data (1.2021.1)
|
||||||
tzinfo (>= 1.0.0)
|
tzinfo (>= 1.0.0)
|
||||||
|
uber (0.1.0)
|
||||||
unf (0.1.4)
|
unf (0.1.4)
|
||||||
unf_ext
|
unf_ext
|
||||||
unf_ext (0.0.7.7)
|
unf_ext (0.0.7.7)
|
||||||
@@ -408,6 +468,7 @@ GEM
|
|||||||
rack-proxy (>= 0.6.1)
|
rack-proxy (>= 0.6.1)
|
||||||
railties (>= 5.2)
|
railties (>= 5.2)
|
||||||
semantic_range (>= 2.3.0)
|
semantic_range (>= 2.3.0)
|
||||||
|
webrick (1.7.0)
|
||||||
websocket-driver (0.7.3)
|
websocket-driver (0.7.3)
|
||||||
websocket-extensions (>= 0.1.0)
|
websocket-extensions (>= 0.1.0)
|
||||||
websocket-extensions (0.1.5)
|
websocket-extensions (0.1.5)
|
||||||
@@ -447,6 +508,8 @@ DEPENDENCIES
|
|||||||
factory_bot
|
factory_bot
|
||||||
ffaker
|
ffaker
|
||||||
flamegraph
|
flamegraph
|
||||||
|
google-cloud-bigquery
|
||||||
|
google-cloud-storage
|
||||||
hsluv
|
hsluv
|
||||||
http
|
http
|
||||||
http-cookie!
|
http-cookie!
|
||||||
|
|||||||
7
app/jobs/bigquery_export_job.rb
Normal file
7
app/jobs/bigquery_export_job.rb
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
class BigqueryExportJob < ApplicationJob
|
||||||
|
retry_on Exception, attempts: 0
|
||||||
|
|
||||||
|
def perform(model:, **options)
|
||||||
|
BigqueryExportService.new(model, **options).export!
|
||||||
|
end
|
||||||
|
end
|
||||||
95
app/logical/bigquery_export_service.rb
Normal file
95
app/logical/bigquery_export_service.rb
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
# Export all public data in a model to BigQuery and to Google Cloud Storage.
|
||||||
|
|
||||||
|
class BigqueryExportService
|
||||||
|
extend Memoist
|
||||||
|
|
||||||
|
attr_reader :model, :dataset_name, :credentials
|
||||||
|
|
||||||
|
def initialize(model = nil, dataset_name: "danbooru_public", credentials: default_credentials)
|
||||||
|
@model = model
|
||||||
|
@dataset_name = dataset_name
|
||||||
|
@credentials = credentials
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.async_export_all!(**options)
|
||||||
|
models.each do |model|
|
||||||
|
BigqueryExportJob.perform_later(model: model, **options)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.models
|
||||||
|
Rails.application.eager_load!
|
||||||
|
|
||||||
|
models = ApplicationRecord.descendants.sort_by(&:name)
|
||||||
|
models -= [Ban, Favorite, IpAddress, TagRelationship, ArtistVersion, ArtistCommentaryVersion, NoteVersion, PoolVersion, PostVersion, WikiPageVersion]
|
||||||
|
models
|
||||||
|
end
|
||||||
|
|
||||||
|
def enabled?
|
||||||
|
credentials.present?
|
||||||
|
end
|
||||||
|
|
||||||
|
def export!
|
||||||
|
return unless enabled? && records.any?
|
||||||
|
|
||||||
|
file = dump_records!
|
||||||
|
upload_to_bigquery!(file)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Dump the model records to a gzipped, newline-delimited JSON tempfile.
|
||||||
|
def dump_records!
|
||||||
|
file = Tempfile.new("danbooru-export-dump-", binmode: true)
|
||||||
|
file = Zlib::GzipWriter.new(file)
|
||||||
|
|
||||||
|
CurrentUser.scoped(User.anonymous) do
|
||||||
|
records.find_each(batch_size: 5_000) do |record|
|
||||||
|
file.puts(record.to_json)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
file.close # flush zlib footer
|
||||||
|
file
|
||||||
|
end
|
||||||
|
|
||||||
|
# GCS: gs://danbooru_public/data/{model}.json
|
||||||
|
# BQ: danbooru1.danbooru_public.{model}
|
||||||
|
def upload_to_bigquery!(file)
|
||||||
|
table_name = model.model_name.collection
|
||||||
|
gsfilename = "data/#{table_name}.json"
|
||||||
|
|
||||||
|
gsfile = bucket.create_file(file.path, gsfilename, content_encoding: "gzip")
|
||||||
|
job = dataset.load_job(table_name, gsfile, format: "json", autodetect: true, create: "needed", write: "truncate")
|
||||||
|
|
||||||
|
job.wait_until_done!
|
||||||
|
job
|
||||||
|
end
|
||||||
|
|
||||||
|
# private
|
||||||
|
|
||||||
|
def records
|
||||||
|
model.visible(User.anonymous)
|
||||||
|
end
|
||||||
|
|
||||||
|
def dataset
|
||||||
|
bigquery.dataset(dataset_name) || bigquery.create_dataset(dataset_name)
|
||||||
|
end
|
||||||
|
|
||||||
|
def bucket
|
||||||
|
storage.bucket(dataset_name) || storage.create_bucket(dataset_name, acl: "public", default_acl: "public", storage_class: "standard", location: "us-east1")
|
||||||
|
end
|
||||||
|
|
||||||
|
def bigquery
|
||||||
|
Google::Cloud::Bigquery.new(credentials: credentials)
|
||||||
|
end
|
||||||
|
|
||||||
|
def storage
|
||||||
|
Google::Cloud::Storage.new(credentials: credentials)
|
||||||
|
end
|
||||||
|
|
||||||
|
def default_credentials
|
||||||
|
return nil unless Danbooru.config.google_cloud_credentials.present?
|
||||||
|
JSON.parse(Danbooru.config.google_cloud_credentials)
|
||||||
|
end
|
||||||
|
|
||||||
|
memoize :dataset, :bucket, :bigquery, :storage, :default_credentials
|
||||||
|
end
|
||||||
@@ -15,6 +15,7 @@ module DanbooruMaintenance
|
|||||||
safely { BulkUpdateRequestPruner.warn_old }
|
safely { BulkUpdateRequestPruner.warn_old }
|
||||||
safely { BulkUpdateRequestPruner.reject_expired }
|
safely { BulkUpdateRequestPruner.reject_expired }
|
||||||
safely { Ban.prune! }
|
safely { Ban.prune! }
|
||||||
|
safely { BigqueryExportService.async_export_all! }
|
||||||
safely { ActiveRecord::Base.connection.execute("vacuum analyze") unless Rails.env.test? }
|
safely { ActiveRecord::Base.connection.execute("vacuum analyze") unless Rails.env.test? }
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -512,6 +512,18 @@ module Danbooru
|
|||||||
def cloudflare_zone
|
def cloudflare_zone
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Google Cloud API key. Used for exporting data to BigQuery and to Google
|
||||||
|
# Cloud Storage. Should be the JSON key object you get after creating a
|
||||||
|
# service account. Must have the "BigQuery User" and "Storage Admin" roles.
|
||||||
|
#
|
||||||
|
# * Go to https://console.cloud.google.com/iam-admin/serviceaccounts and create a service account.
|
||||||
|
# * Go to "Keys" and add a new key.
|
||||||
|
# * Go to https://console.cloud.google.com/iam-admin/iam and add the
|
||||||
|
# BigQuery User and Storage Admin roles to the service account.
|
||||||
|
# * Paste the JSON key file here.
|
||||||
|
def google_cloud_credentials
|
||||||
|
end
|
||||||
|
|
||||||
# The URL for the recommender server (https://github.com/evazion/recommender).
|
# The URL for the recommender server (https://github.com/evazion/recommender).
|
||||||
# Optional. Used to generate post recommendations.
|
# Optional. Used to generate post recommendations.
|
||||||
# Set to http://localhost/mock/recommender to enable a fake recommender
|
# Set to http://localhost/mock/recommender to enable a fake recommender
|
||||||
|
|||||||
19
test/unit/bigquery_export_service_test.rb
Normal file
19
test/unit/bigquery_export_service_test.rb
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
require 'test_helper'
|
||||||
|
|
||||||
|
class BigqueryExportServiceTest < ActiveSupport::TestCase
|
||||||
|
context "BigqueryExportService: " do
|
||||||
|
context "#async_export_all! method" do
|
||||||
|
should "export all tables to BigQuery" do
|
||||||
|
@post = create(:post, tag_string: "tagme")
|
||||||
|
@bigquery = BigqueryExportService.new(dataset_name: "testbooru_export")
|
||||||
|
skip unless @bigquery.enabled?
|
||||||
|
|
||||||
|
BigqueryExportService.async_export_all!(dataset_name: "testbooru_export")
|
||||||
|
perform_enqueued_jobs
|
||||||
|
|
||||||
|
assert_equal(1, @bigquery.dataset.table("posts").rows_count)
|
||||||
|
assert_equal(1, @bigquery.dataset.table("tags").rows_count)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
Reference in New Issue
Block a user