Home Mysql 继续处理大数据量的精简hugedata
Post
Cancel

Mysql 继续处理大数据量的精简hugedata

从下面可以看到各种结果集的数量

mysql> select count(*) from orders  where state = 200;
+-----------+
| count(*)  |
+-----------+
| 194544642 |
+-----------+
1 row in set (1 min 12.58 sec)

mysql> select count(*) from orders  where state = 100;
+----------+
| count(*) |
+----------+
|    14119 |
+----------+
1 row in set (0.02 sec)

mysql> select count(*) from orders  where state = 0;
+-----------+
| count(*)  |
+-----------+
| 337920287 |
+-----------+
1 row in set (2 min 7.72 sec)

mysql> select count(*) from orders;
+-----------+
| count(*)  |
+-----------+
| 532479048 |
+-----------+
1 row in set (31 min 13.40 sec)

mysql> select count(*) from trades;
+-----------+
| count(*)  |
+-----------+
| 104786793 |
+-----------+
1 row in set (5 min 26.64 sec)

mysql> select count(*) from trades where ask_member_id != bid_member_id;
+----------+
| count(*) |
+----------+
|  6183063 |
+----------+
1 row in set (5 min 32.72 sec)

查看不同交易对的成交数量

mysql> select count(*) as the_count, market_id from trades group by market_id;
+-----------+-----------+
| the_count | market_id |
+-----------+-----------+
|   1579304 | arpacg    |
|   1572511 | arpausdt  |
|   1653507 | batcg     |
|   1648798 | batusdt   |
|   1536013 | bhtcg     |
|   1541508 | bhtusdt   |
|   1715341 | btccg     |
|   1864639 | btcusdt   |
|   7667576 | cabcg     |
|   1786793 | cabusdt   |
|   1593117 | crecg     |
|   1558788 | creusdt   |
|   1626310 | crocg     |
|   1630426 | crousdt   |
|   1552852 | cvccg     |
|   1540007 | cvcusdt   |
|   1657724 | ektcg     |
|   2098719 | ektusdt   |
|   1672984 | elfcg     |
|   1672089 | elfusdt   |
|    568978 | ethcg     |
|   1637454 | ethusdt   |
|   1589476 | fttcg     |
|   1650413 | fttusdt   |
|   1445482 | gntcg     |
|   1456987 | gntusdt   |
|   1507198 | gtcg      |
|   1500125 | gtusdt    |
|   1544403 | hptcg     |
|   1527532 | hptusdt   |
|   1618695 | htcg      |
|   1638050 | htusdt    |
|    255891 | iicg      |
|   1576644 | kancg     |
|   1520381 | kanusdt   |
|   1639140 | linkcg    |
|   1645950 | linkusdt  |
|   1656791 | manacg    |
|   1654290 | manausdt  |
|   1568851 | mxcg      |
|   1564451 | mxusdt    |
|   1662618 | omgcg     |
|   1668741 | omgusdt   |
|   1649757 | rencg     |
|   1646899 | renusdt   |
|   1565752 | rsrcg     |
|   1548872 | rsrusdt   |
|   1684195 | seelecg   |
|   1652623 | seeleusdt |
|   1590981 | smtcg     |
|   1567765 | smtusdt   |
|   1655702 | sntcg     |
|   1660345 | sntusdt   |
|      2637 | sssusdt   |
|   1545870 | storjcg   |
|   1529647 | storjusdt |
|       163 | tlpusdt   |
|   1623956 | topcg     |
|   1560439 | topusdt   |
|    272516 | usdtcg    |
|   1578944 | wtccg     |
|   1572284 | wtcusdt   |
|   1648049 | zrxcg     |
|   1652394 | zrxusdt   |
|   2309456 | zztusdt   |
+-----------+-----------+
65 rows in set (44.21 sec)

查看不同交易对的人为交易

select count(*) as the_count, market_id from trades where ask_member_id != bid_member_id group by market_id;

+-----------+-----------+
| the_count | market_id |
+-----------+-----------+
|        11 | arpacg    |
|         3 | arpausdt  |
|         3 | batcg     |
|         6 | bhtcg     |
|         1 | bhtusdt   |
|        35 | btccg     |
|         9 | btcusdt   |
|   3404631 | cabcg     |
|   1051027 | cabusdt   |
|       337 | crecg     |
|        29 | creusdt   |
|      1578 | crocg     |
|         9 | cvccg     |
|      1926 | cvcusdt   |
|        39 | ektcg     |
|         6 | ektusdt   |
|        69 | elfcg     |
|       128 | ethcg     |
|        62 | ethusdt   |
|        31 | fttcg     |
|       400 | gntcg     |
|         9 | gtcg      |
|        32 | gtusdt    |
|       575 | htcg      |
|       320 | htusdt    |
|     17143 | iicg      |
|         8 | kancg     |
|        22 | kanusdt   |
|        31 | linkcg    |
|         1 | linkusdt  |
|      2537 | manacg    |
|        34 | mxcg      |
|       137 | omgcg     |
|        26 | omgusdt   |
|        29 | rencg     |
|     31156 | rsrcg     |
|     13491 | rsrusdt   |
|        37 | seelecg   |
|      6460 | seeleusdt |
|       104 | smtcg     |
|     12910 | smtusdt   |
|       712 | sntcg     |
|      6904 | sntusdt   |
|      1878 | sssusdt   |
|       790 | storjcg   |
|       146 | tlpusdt   |
|     55786 | topcg     |
|       547 | topusdt   |
|       247 | usdtcg    |
|         7 | wtccg     |
|       284 | zrxcg     |
|         5 | zrxusdt   |
|   1570355 | zztusdt   |
+-----------+-----------+
53 rows in set (4 hours 54 min 43.07 sec)

下面是一些时间

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

mysql> use cadae;                                                                                                                                 
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
mysql> drop table if exists temp_orders;                                                                                                          
Query OK, 0 rows affected (0.26 sec)

mysql> create table temp_orders like orders;                                                                                                     
 Query OK, 0 rows affected (0.05 sec)

mysql> insert into temp_orders select * from orders where market_id in ('cabcg');                                                                 
Query OK, 7782140 rows affected (32 min 59.59 sec)
Records: 7782140  Duplicates: 0  Warnings: 0

mysql> insert into temp_orders select * from orders where market_id in ('cabusdt');
Query OK, 1801003 rows affected (2 min 39.08 sec)
Records: 1801003  Duplicates: 0  Warnings: 0

mysql> insert into temp_orders select * from orders where market_id in ('zztusdt');
 Query OK, 2587381 rows affected (24 min 22.60 sec)
Records: 2587381  Duplicates: 0  Warnings: 0

mysql>  drop table if exists temp_trades;
Query OK, 0 rows affected, 1 warning (0.02 sec)

mysql> create table temp_trades like trades;
Query OK, 0 rows affected (0.11 sec)

mysql> insert into temp_trades select * from trades where market_id in ('cabcg');
Query OK, 7752107 rows affected (46 min 47.78 sec)
Records: 7752107  Duplicates: 0  Warnings: 0

mysql> insert into temp_trades select * from trades where market_id in ('cabusdt');
Query OK, 1786793 rows affected (8 min 44.51 sec)
Records: 1786793  Duplicates: 0  Warnings: 0

mysql> 
mysql> 
mysql> 
mysql> insert into temp_trades select * from trades where market_id in ('zztusdt');
Query OK, 2516337 rows affected (5 min 42.15 sec)
Records: 2516337  Duplicates: 0  Warnings: 0


	select count(*) from trades where ask_member_id = bid_member_id and bid_member_id not in (31237,31238,28665,28652,31217,31218,28155,28044,28158,28180,28187,28188,28189,28190,28191,31229,31230,28664,28651,31221,31222,31231,31232,28670,28657,28156,28045,28666,28653,28668,28655,31211,31212,31213,31214,28662,28649,28200,31225,31226,28661,28648,28660,28647,31215,31216,28659,28646,28671,28658,31223,31224,28669,28656,31235,31236,28667,28654,31227,31228,31233,31234,28157,31219,31220,28663,28650,45402,45401,27997,27998,27999,)  and market_id not in ('cabcg', 'cabusdt', 'zztusdt') group by ask_member_id;

+----------+---------------+
| count(*) | ask_member_id |
+----------+---------------+
|        9 |            19 |
|        3 |         16664 |
|        2 |         16899 |
|       12 |         27847 |
|      665 |         27997 |
|       41 |         27998 |
|       36 |         27999 |
|        1 |         28102 |
|        3 |         28202 |
|        1 |         39023 |
|        8 |         39024 |
|        3 |         39025 |
|        5 |         39028 |
+----------+---------------+
13 rows in set (4 min 33.95 sec)

mysql> insert into temp_trades select * from trades where ask_member_id = bid_member_id and bid_member_id not in (31237,31238,28665,28652,31217,31218,28155,28044,28158,28180,28187,28188,28189,28190,28191,31229,31230,28664,28651,31221,31222,31231,31232,28670,28657,28156,28045,28666,28653,28668,28655,31211,31212,31213,31214,28662,28649,28200,31225,31226,28661,28648,28660,28647,31215,31216,28659,28646,28671,28658,31223,31224,28669,28656,31235,31236,28667,28654,31227,31228,31233,31234,28157,31219,31220,28663,28650,45402,45401,27997,27998,27999,28102,28202,39023,39024,39025,39028,19,27847)  and market_id not in ('cabcg', 'cabusdt', 'zztusdt');
Query OK, 5 rows affected (4 min 37.64 sec)
Records: 5  Duplicates: 0  Warnings: 0


mysql> insert into temp_trades select * from trades where ask_member_id != bid_member_id and market_id not in ('cabcg', 'zztusdt', 'cabusdt');
Query OK, 185942 rows affected (5 min 13.55 sec)
Records: 185942  Duplicates: 0  Warnings: 0

然后运行 逻辑代码中的SQL: 

下面是完整代码

ENV['RAILS_ENV'] = ARGV.first || ENV['RAILS_ENV'] || 'production'
require File.expand_path(File.dirname(__FILE__) + "/../config/environment")
require 'rails'
require 'rubygems'
require 'rufus/scheduler'


# 精简计划:
#
# 这一步可以先放着. 在老的数据库上, 日后再做:(也就是说,容忍老的机器人账户的帐不平先)
# 让机器人的手续费都返回来.  ( trade fees, 返回 给对应的 ask member, bid member) 这样就平了.
#
# 同时,老一代机器人账户退休,  新一代机器人账户上场
#
#
# 删掉所有机器人自成交的trades ( ask_member_id = bid_member_id  and ask_member_id in (robot_ids)
#
# 删掉所有机器人自成交的trades对应的orders ( 保留所有已经存在的trades 对应的 orders )
#
# 删掉所有取消的orders ( state = 0 and volume == origin_volume)
#
# 找出所有 trade id 不是 robot = robot的情况
#
#
#
#
# 进一步思路:
# 查看不同交易对的人为交易
#select count(*) as the_count, market_id from trades where ask_member_id != bid_member_id group by market_id;
#+-----------+-----------+
#| the_count | market_id |
#+-----------+-----------+
#|        11 | arpacg    |仅仅保留人机 orders + trades
#|         3 | arpausdt  |仅仅保留人机 orders + trades
#|         3 | batcg     |仅仅保留人机 orders + trades
#|         6 | bhtcg     |仅仅保留人机 orders + trades
#|         1 | bhtusdt   |仅仅保留人机 orders + trades
#|        35 | btccg     |仅仅保留人机 orders + trades
#|         9 | btcusdt   |仅仅保留人机 orders + trades
#|   3404631 | cabcg     |全都保留 orders + trades
#|   1051027 | cabusdt   |全都保留
#|       337 | crecg     |仅仅保留人机 orders + trades
#|        29 | creusdt   |仅仅保留人机 orders + trades
#|      1578 | crocg     |仅仅保留人机 orders + trades
#|         9 | cvccg     |仅仅保留人机 orders + trades
#|      1926 | cvcusdt   |仅仅保留人机 orders + trades
#|        39 | ektcg     |仅仅保留人机 orders + trades
#|         6 | ektusdt   |仅仅保留人机 orders + trades
#|        69 | elfcg     |仅仅保留人机 orders + trades
#|       128 | ethcg     |.....
#|        62 | ethusdt   |.....
#|        31 | fttcg     |.....
#|       400 | gntcg     |
#|         9 | gtcg      |
#|        32 | gtusdt    |
#|       575 | htcg      |
#|       320 | htusdt    |
#|     17143 | iicg      |
#|         8 | kancg     |
#|        22 | kanusdt   |
#|        31 | linkcg    |
#|         1 | linkusdt  |
#|      2537 | manacg    |
#|        34 | mxcg      |
#|       137 | omgcg     |
#|        26 | omgusdt   |
#|        29 | rencg     |
#|     31156 | rsrcg     |
#|     13491 | rsrusdt   |
#|        37 | seelecg   |
#|      6460 | seeleusdt |
#|       104 | smtcg     |
#|     12910 | smtusdt   |
#|       712 | sntcg     |
#|      6904 | sntusdt   |
#|      1878 | sssusdt   |
#|       790 | storjcg   |
#|       146 | tlpusdt   |
#|     55786 | topcg     |
#|       547 | topusdt   |
#|       247 | usdtcg    |
#|         7 | wtccg     |
#|       284 | zrxcg     |
#|         5 | zrxusdt   |
#|   1570355 | zztusdt   | 全都保留
#+-----------+-----------+

# 是否不执行,只演练
@@is_dry_run = true

def run_sql sql
  if @@is_dry_run
    puts "sql: #{sql}"
  else
    ActiveRecord::Base.connection.execute(sql)
  end
end

# 所有的 market_id in (cabcg, cabusdt, zztusdt)
# orders, trades都保留, 大约500W
run_sql "drop table if exists temp_orders"
run_sql "create table temp_orders like orders;"
run_sql "insert into temp_orders select * from orders where market_id in ('cabcg');"
run_sql "insert into temp_orders select * from orders where market_id in ('cabusdt');"
run_sql "insert into temp_orders select * from orders where market_id in ('zztusdt');"
run_sql "drop table if exists temp_trades"
run_sql "create table temp_trades like trades;"
#select * from trades where ask_member_id != bid_member_id and market_id not in ('cabcg', 'zztusdt', 'cabusdt');
# 下面这里, 有可能 普通用户也会 自成交. 所以,我们还要一个sql , 来查询这个
run_sql %Q{
  insert into temp_trades select * from trades where ask_member_id != bid_member_id and market_id not in ('cabcg', 'zztusdt', 'cabusdt');
}


#trades = Trade.where('ask_member_id != bid_member_id and market_id not in ?', ['cabcg', 'zztusdt', 'cabusdt'])
#trade_ids = trades.map(&:id)

# 上一步的人机trades, 对应的 orders, 都保留  ( 获得对应的order id )

@@is_dry_run = false
temp_ask_order_ids = []
temp_bid_order_ids = []
puts "== before ... #{Time.now}"
TempTrade.find_each do |trade|
  temp_ask_order_ids << trade.ask_order_id
  temp_bid_order_ids << trade.bid_order_id
end
puts "== after... #{Time.now}, temp_ask_order_ids.size:"
puts temp_ask_order_ids.size

all_order_ids = (temp_ask_order_ids + temp_bid_order_ids ).uniq

## 貌似这里是有query string 的长度限制的. 我们就500 一组的执行吧.
all_order_ids.each_slice(500) do |five_hundred_order_ids|
  run_sql = "insert into temp_orders select * from orders where id in (#{five_hundred_order_ids.join(',')}); "
end

# 其他的 orders, trades 都删除 (也就是都不要了, 放着就好)

run_sql 'drop table orders;'
run_sql 'drop table trades;'
run_sql 'rename table temp_orders to orders;'
run_sql 'rename table temp_trades to trades;'

puts " 接下来就是手动 根据对账,把帐抹平, 因为记录特别少,所以跑一遍对账就可以了. 大约60~120个账户"
puts " 同时,老一代机器人账户退休,  新一代机器人账户上场"
# robot ids
temp = %w{31237 31238 28665 28652 31217 31218 28155 28044 28158 28180 28187 28188 28189 28190 28191 31229 31230 28664 28651 31221 31222 31231 31232 28670 28657 28156 28045 28666 28653 28668 28655 31211 31212 31213 31214 28662 28649 28200 31225 31226 28661 28648 28660 28647 31215 31216 28659 28646 28671 28658 31223 31224 28669 28656 31235 31236 28667 28654 31227 31228 31233 31234 28157 31219 31220 28663 28650 45402 45401}
puts "== 机器人 id size: #{temp.size}"
puts "== 机器人 id : #{temp.inspect}"




# 后期:
# 1. 上可以删除取消 orders 的 撮合引擎(永建做的ruby 版本)
# 2. 已经成交的trades, 最多保留一周. 一周之后, 机器人自成交的trades, 需要有个脚本来删除
#

This post is licensed under CC BY 4.0 by the author.
Contents