Export public database dumps to BigQuery.

* Export daily public database dumps to BigQuery and Google Cloud Storage.
* Only data visible to anonymous users is exported. Some tables have
  null or missing fields because of this.
* The bans table is excluded because some bans have an expires_at
  timestamp set beyond year 9999, which BigQuery doesn't support.
* The favorites table is excluded because it's too slow to dump (it
  doesn't have an id index, which is needed by find_each).
* Version tables are excluded because dumping them every day is
  inefficient, streaming insertions should be used instead.

Links:

* https://console.cloud.google.com/bigquery?project=danbooru1
* https://console.cloud.google.com/storage/browser/danbooru_public
* https://storage.googleapis.com/danbooru_public/data/posts.json
This commit is contained in:
evazion
2021-03-10 01:31:32 -06:00
parent 5623cfb145
commit f235b72b3f
8 changed files with 200 additions and 0 deletions

View File

@@ -161,12 +161,16 @@ GEM
daemons (1.3.1)
dante (0.2.0)
debug_inspector (1.0.0)
declarative (0.0.20)
declarative-option (0.1.0)
delayed_job (4.1.9)
activesupport (>= 3.0, < 6.2)
delayed_job_active_record (4.1.5)
activerecord (>= 3.0, < 6.2)
delayed_job (>= 3.0, < 5)
diff-lcs (1.4.4)
digest-crc (0.6.3)
rake (>= 12.0.0, < 14.0.0)
docile (1.3.5)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
@@ -192,6 +196,49 @@ GEM
ffi (~> 1.0)
globalid (0.4.2)
activesupport (>= 4.2.0)
google-apis-bigquery_v2 (0.6.0)
google-apis-core (~> 0.1)
google-apis-core (0.3.0)
addressable (~> 2.5, >= 2.5.1)
googleauth (~> 0.14)
httpclient (>= 2.8.1, < 3.0)
mini_mime (~> 1.0)
representable (~> 3.0)
retriable (>= 2.0, < 4.0)
rexml
signet (~> 0.14)
webrick
google-apis-iamcredentials_v1 (0.2.0)
google-apis-core (~> 0.1)
google-apis-storage_v1 (0.3.0)
google-apis-core (~> 0.1)
google-cloud-bigquery (1.28.0)
concurrent-ruby (~> 1.0)
google-apis-bigquery_v2 (~> 0.1)
google-cloud-core (~> 1.2)
googleauth (~> 0.9)
mini_mime (~> 1.0)
google-cloud-core (1.5.0)
google-cloud-env (~> 1.0)
google-cloud-errors (~> 1.0)
google-cloud-env (1.5.0)
faraday (>= 0.17.3, < 2.0)
google-cloud-errors (1.0.1)
google-cloud-storage (1.30.0)
addressable (~> 2.5)
digest-crc (~> 0.4)
google-apis-iamcredentials_v1 (~> 0.1)
google-apis-storage_v1 (~> 0.1)
google-cloud-core (~> 1.2)
googleauth (~> 0.9)
mini_mime (~> 1.0)
googleauth (0.16.0)
faraday (>= 0.17.3, < 2.0)
jwt (>= 1.4, < 3.0)
memoist (~> 0.16)
multi_json (~> 1.11)
os (>= 0.9, < 2.0)
signet (~> 0.14)
hsluv (1.0.1)
http (4.4.1)
addressable (~> 2.3)
@@ -201,6 +248,7 @@ GEM
http-form_data (2.3.0)
http-parser (1.2.3)
ffi-compiler (>= 1.0, < 2.0)
httpclient (2.8.3)
i18n (1.8.9)
concurrent-ruby (~> 1.0)
ipaddress_2 (0.13.0)
@@ -257,6 +305,7 @@ GEM
multi_json (~> 1.3)
multi_xml (~> 0.5)
rack (>= 1.2, < 3)
os (1.1.1)
parallel (1.20.1)
parser (3.0.0.0)
ast (~> 2.4.1)
@@ -321,9 +370,14 @@ GEM
json
redis (4.2.5)
regexp_parser (2.1.1)
representable (3.0.4)
declarative (< 0.1.0)
declarative-option (< 0.2.0)
uber (< 0.2.0)
responders (3.0.1)
actionpack (>= 5.0)
railties (>= 5.0)
retriable (3.1.2)
rexml (3.2.4)
rubocop (1.11.0)
parallel (~> 1.10)
@@ -359,6 +413,11 @@ GEM
shoulda-context (2.0.0)
shoulda-matchers (4.5.1)
activesupport (>= 4.2.0)
signet (0.15.0)
addressable (~> 2.3)
faraday (>= 0.17.3, < 2.0)
jwt (>= 1.5, < 3.0)
multi_json (~> 1.10)
simple_form (5.1.0)
actionpack (>= 5.2)
activemodel (>= 5.2)
@@ -391,6 +450,7 @@ GEM
concurrent-ruby (~> 1.0)
tzinfo-data (1.2021.1)
tzinfo (>= 1.0.0)
uber (0.1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.7)
@@ -408,6 +468,7 @@ GEM
rack-proxy (>= 0.6.1)
railties (>= 5.2)
semantic_range (>= 2.3.0)
webrick (1.7.0)
websocket-driver (0.7.3)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
@@ -447,6 +508,8 @@ DEPENDENCIES
factory_bot
ffaker
flamegraph
google-cloud-bigquery
google-cloud-storage
hsluv
http
http-cookie!