diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index f2d756acf..f834636c4 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -49,6 +49,7 @@ jobs:
         DANBOORU_RAKISMET_KEY: ${{ secrets.DANBOORU_RAKISMET_KEY }}
         DANBOORU_RAKISMET_URL: ${{ secrets.DANBOORU_RAKISMET_URL }}
         DANBOORU_IP_REGISTRY_API_KEY: ${{ secrets.DANBOORU_IP_REGISTRY_API_KEY }}
+        DANBOORU_GOOGLE_CLOUD_CREDENTIALS: ${{ secrets.DANBOORU_GOOGLE_CLOUD_CREDENTIALS }}
         DANBOORU_STRIPE_SECRET_KEY: ${{ secrets.DANBOORU_STRIPE_SECRET_KEY }}
         DANBOORU_STRIPE_PUBLISHABLE_KEY: ${{ secrets.DANBOORU_STRIPE_PUBLISHABLE_KEY }}
         DANBOORU_STRIPE_WEBHOOK_SECRET: ${{ secrets.DANBOORU_STRIPE_WEBHOOK_SECRET }}
diff --git a/Gemfile b/Gemfile
index a58d5bd6f..f009eb6c4 100644
--- a/Gemfile
+++ b/Gemfile
@@ -47,6 +47,8 @@
 gem 'nokogiri'
 gem 'view_component', require: 'view_component/engine'
 gem 'tzinfo-data'
 gem 'hsluv'
+gem 'google-cloud-bigquery', require: "google/cloud/bigquery"
+gem 'google-cloud-storage', require: "google/cloud/storage"
 group :production, :staging do
   gem 'unicorn', :platforms => :ruby
diff --git a/Gemfile.lock b/Gemfile.lock
index 015148258..d85a361dd 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -161,12 +161,16 @@
     daemons (1.3.1)
     dante (0.2.0)
     debug_inspector (1.0.0)
+    declarative (0.0.20)
+    declarative-option (0.1.0)
     delayed_job (4.1.9)
       activesupport (>= 3.0, < 6.2)
     delayed_job_active_record (4.1.5)
       activerecord (>= 3.0, < 6.2)
       delayed_job (>= 3.0, < 5)
     diff-lcs (1.4.4)
+    digest-crc (0.6.3)
+      rake (>= 12.0.0, < 14.0.0)
     docile (1.3.5)
     domain_name (0.5.20190701)
       unf (>= 0.0.5, < 1.0.0)
@@ -192,6 +196,49 @@
       ffi (~> 1.0)
     globalid (0.4.2)
       activesupport (>= 4.2.0)
+    google-apis-bigquery_v2 (0.6.0)
+      google-apis-core (~> 0.1)
+    google-apis-core (0.3.0)
+      addressable (~> 2.5, >= 2.5.1)
+      googleauth (~> 0.14)
+      httpclient (>= 2.8.1, < 3.0)
+      mini_mime (~> 1.0)
+      representable (~> 3.0)
+      retriable (>= 2.0, < 4.0)
+      rexml
+      signet (~> 0.14)
+      webrick
+    google-apis-iamcredentials_v1 (0.2.0)
+      google-apis-core (~> 0.1)
+    google-apis-storage_v1 (0.3.0)
+      google-apis-core (~> 0.1)
+    google-cloud-bigquery (1.28.0)
+      concurrent-ruby (~> 1.0)
+      google-apis-bigquery_v2 (~> 0.1)
+      google-cloud-core (~> 1.2)
+      googleauth (~> 0.9)
+      mini_mime (~> 1.0)
+    google-cloud-core (1.5.0)
+      google-cloud-env (~> 1.0)
+      google-cloud-errors (~> 1.0)
+    google-cloud-env (1.5.0)
+      faraday (>= 0.17.3, < 2.0)
+    google-cloud-errors (1.0.1)
+    google-cloud-storage (1.30.0)
+      addressable (~> 2.5)
+      digest-crc (~> 0.4)
+      google-apis-iamcredentials_v1 (~> 0.1)
+      google-apis-storage_v1 (~> 0.1)
+      google-cloud-core (~> 1.2)
+      googleauth (~> 0.9)
+      mini_mime (~> 1.0)
+    googleauth (0.16.0)
+      faraday (>= 0.17.3, < 2.0)
+      jwt (>= 1.4, < 3.0)
+      memoist (~> 0.16)
+      multi_json (~> 1.11)
+      os (>= 0.9, < 2.0)
+      signet (~> 0.14)
     hsluv (1.0.1)
     http (4.4.1)
       addressable (~> 2.3)
@@ -201,6 +248,7 @@
       http-form_data (2.3.0)
     http-parser (1.2.3)
       ffi-compiler (>= 1.0, < 2.0)
+    httpclient (2.8.3)
     i18n (1.8.9)
       concurrent-ruby (~> 1.0)
     ipaddress_2 (0.13.0)
@@ -257,6 +305,7 @@
       multi_json (~> 1.3)
       multi_xml (~> 0.5)
       rack (>= 1.2, < 3)
+    os (1.1.1)
     parallel (1.20.1)
     parser (3.0.0.0)
       ast (~> 2.4.1)
@@ -321,9 +370,14 @@
       json
     redis (4.2.5)
     regexp_parser (2.1.1)
+    representable (3.0.4)
+      declarative (< 0.1.0)
+      declarative-option (< 0.2.0)
+      uber (< 0.2.0)
     responders (3.0.1)
       actionpack (>= 5.0)
       railties (>= 5.0)
+    retriable (3.1.2)
     rexml (3.2.4)
     rubocop (1.11.0)
       parallel (~> 1.10)
@@ -359,6 +413,11 @@
     shoulda-context (2.0.0)
     shoulda-matchers (4.5.1)
       activesupport (>= 4.2.0)
+    signet (0.15.0)
+      addressable (~> 2.3)
+      faraday (>= 0.17.3, < 2.0)
+      jwt (>= 1.5, < 3.0)
+      multi_json (~> 1.10)
     simple_form (5.1.0)
       actionpack (>= 5.2)
       activemodel (>= 5.2)
@@ -391,6 +450,7 @@
       concurrent-ruby (~> 1.0)
     tzinfo-data (1.2021.1)
       tzinfo (>= 1.0.0)
+    uber (0.1.0)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.7)
@@ -408,6 +468,7 @@
       rack-proxy (>= 0.6.1)
       railties (>= 5.2)
       semantic_range (>= 2.3.0)
+    webrick (1.7.0)
     websocket-driver (0.7.3)
       websocket-extensions (>= 0.1.0)
     websocket-extensions (0.1.5)
@@ -447,6 +508,8 @@ DEPENDENCIES
   factory_bot
   ffaker
   flamegraph
+  google-cloud-bigquery
+  google-cloud-storage
   hsluv
   http
   http-cookie!
diff --git a/app/jobs/bigquery_export_job.rb b/app/jobs/bigquery_export_job.rb
new file mode 100644
index 000000000..007eea8fd
--- /dev/null
+++ b/app/jobs/bigquery_export_job.rb
@@ -0,0 +1,7 @@
+class BigqueryExportJob < ApplicationJob
+  retry_on Exception, attempts: 0
+
+  def perform(model:, **options)
+    BigqueryExportService.new(model, **options).export!
+  end
+end
diff --git a/app/logical/bigquery_export_service.rb b/app/logical/bigquery_export_service.rb
new file mode 100644
index 000000000..c8d5a957c
--- /dev/null
+++ b/app/logical/bigquery_export_service.rb
@@ -0,0 +1,95 @@
+# Export all public data in a model to BigQuery and to Google Cloud Storage.
+
+class BigqueryExportService
+  extend Memoist
+
+  attr_reader :model, :dataset_name, :credentials
+
+  def initialize(model = nil, dataset_name: "danbooru_public", credentials: default_credentials)
+    @model = model
+    @dataset_name = dataset_name
+    @credentials = credentials
+  end
+
+  def self.async_export_all!(**options)
+    models.each do |model|
+      BigqueryExportJob.perform_later(model: model, **options)
+    end
+  end
+
+  def self.models
+    Rails.application.eager_load!
+
+    models = ApplicationRecord.descendants.sort_by(&:name)
+    models -= [Ban, Favorite, IpAddress, TagRelationship, ArtistVersion, ArtistCommentaryVersion, NoteVersion, PoolVersion, PostVersion, WikiPageVersion]
+    models
+  end
+
+  def enabled?
+    credentials.present?
+  end
+
+  def export!
+    return unless enabled? && records.any?
+
+    file = dump_records!
+    upload_to_bigquery!(file)
+  end
+
+  # Dump the model records to a gzipped, newline-delimited JSON tempfile.
+  def dump_records!
+    file = Tempfile.new("danbooru-export-dump-", binmode: true)
+    file = Zlib::GzipWriter.new(file)
+
+    CurrentUser.scoped(User.anonymous) do
+      records.find_each(batch_size: 5_000) do |record|
+        file.puts(record.to_json)
+      end
+    end
+
+    file.close # flush zlib footer
+    file
+  end
+
+  # GCS: gs://danbooru_public/data/{model}.json
+  # BQ: danbooru1.danbooru_public.{model}
+  def upload_to_bigquery!(file)
+    table_name = model.model_name.collection
+    gsfilename = "data/#{table_name}.json"
+
+    gsfile = bucket.create_file(file.path, gsfilename, content_encoding: "gzip")
+    job = dataset.load_job(table_name, gsfile, format: "json", autodetect: true, create: "needed", write: "truncate")
+
+    job.wait_until_done!
+    job
+  end
+
+  # private
+
+  def records
+    model.visible(User.anonymous)
+  end
+
+  def dataset
+    bigquery.dataset(dataset_name) || bigquery.create_dataset(dataset_name)
+  end
+
+  def bucket
+    storage.bucket(dataset_name) || storage.create_bucket(dataset_name, acl: "public", default_acl: "public", storage_class: "standard", location: "us-east1")
+  end
+
+  def bigquery
+    Google::Cloud::Bigquery.new(credentials: credentials)
+  end
+
+  def storage
+    Google::Cloud::Storage.new(credentials: credentials)
+  end
+
+  def default_credentials
+    return nil unless Danbooru.config.google_cloud_credentials.present?
+    JSON.parse(Danbooru.config.google_cloud_credentials)
+  end
+
+  memoize :dataset, :bucket, :bigquery, :storage, :default_credentials
+end
diff --git a/app/logical/danbooru_maintenance.rb b/app/logical/danbooru_maintenance.rb
index 48acb7c0e..aea7a73d1 100644
--- a/app/logical/danbooru_maintenance.rb
+++ b/app/logical/danbooru_maintenance.rb
@@ -15,6 +15,7 @@
       safely { BulkUpdateRequestPruner.warn_old }
       safely { BulkUpdateRequestPruner.reject_expired }
       safely { Ban.prune! }
+      safely { BigqueryExportService.async_export_all! }
       safely { ActiveRecord::Base.connection.execute("vacuum analyze") unless Rails.env.test? }
     end
   end
diff --git a/config/danbooru_default_config.rb b/config/danbooru_default_config.rb
index 7bd5bc404..79ce504f1 100644
--- a/config/danbooru_default_config.rb
+++ b/config/danbooru_default_config.rb
@@ -512,6 +512,18 @@
     def cloudflare_zone
     end
 
+    # Google Cloud API key. Used for exporting data to BigQuery and to Google
+    # Cloud Storage. Should be the JSON key object you get after creating a
+    # service account. Must have the "BigQuery User" and "Storage Admin" roles.
+    #
+    # * Go to https://console.cloud.google.com/iam-admin/serviceaccounts and create a service account.
+    # * Go to "Keys" and add a new key.
+    # * Go to https://console.cloud.google.com/iam-admin/iam and add the
+    #   BigQuery User and Storage Admin roles to the service account.
+    # * Paste the JSON key file here.
+    def google_cloud_credentials
+    end
+
     # The URL for the recommender server (https://github.com/evazion/recommender).
     # Optional. Used to generate post recommendations.
     # Set to http://localhost/mock/recommender to enable a fake recommender
diff --git a/test/unit/bigquery_export_service_test.rb b/test/unit/bigquery_export_service_test.rb
new file mode 100644
index 000000000..179d49969
--- /dev/null
+++ b/test/unit/bigquery_export_service_test.rb
@@ -0,0 +1,19 @@
+require 'test_helper'
+
+class BigqueryExportServiceTest < ActiveSupport::TestCase
+  context "BigqueryExportService: " do
+    context "#async_export_all! method" do
+      should "export all tables to BigQuery" do
+        @post = create(:post, tag_string: "tagme")
+        @bigquery = BigqueryExportService.new(dataset_name: "testbooru_export")
+        skip unless @bigquery.enabled?
+
+        BigqueryExportService.async_export_all!(dataset_name: "testbooru_export")
+        perform_enqueued_jobs
+
+        assert_equal(1, @bigquery.dataset.table("posts").rows_count)
+        assert_equal(1, @bigquery.dataset.table("tags").rows_count)
+      end
+    end
+  end
+end