
Ceph Configuration Tuning

Modify Ceph configurations to maximize system resource utilization.

You can modify the /etc/ceph/ceph.conf file to configure all Ceph configuration parameters. The configuration procedure is as follows:

  1. Go to the /etc/ceph/ directory on the ceph1 node and edit the ceph.conf file.
    cd /etc/ceph
    vi ceph.conf 
    
  2. Add the following configurations to the file. Table 1 describes the parameters in the configuration file.
    [global]
    
    public_network = 192.168.3.0/24   # Adjust the address based on the actual network segment.
    cluster_network = 192.168.4.0/24  # Adjust the address based on the actual network segment.
    mon_max_pg_per_osd = 3000
    mon_max_pool_pg_num = 300000
    ms_bind_before_connect = true
    ms_dispatch_throttle_bytes = 2097152000
    osd_pool_default_min_size = 0
    osd_pool_default_pg_num = 1024
    osd_pool_default_pgp_num = 1024
    osd_pool_default_size = 3
    throttler_perf_counter = false
    bluefs_buffered_io=false
    
    osd_max_write_size = 256
    osd_enable_op_tracker = false
    
    rbd_cache = false
    
    [mon]
    mon_allow_pool_delete = true
    
    [osd]
    rocksdb_cache_index_and_filter_blocks = false
    rocksdb_cache_size = 2G
    osd_memory_cache_min = 3G
    osd_memory_base = 3G
    
    bluestore_rocksdb_options = use_direct_reads=true,use_direct_io_for_flush_and_compaction=true,compression=kNoCompression,min_write_buffer_number_to_merge=32,recycle_log_file_num=64,compaction_style=kCompactionStyleLevel,write_buffer_size=64M,target_file_size_base=64M,compaction_threads=32,max_bytes_for_level_multiplier=8,flusher_threads=8,level0_file_num_compaction_trigger=16,level0_slowdown_writes_trigger=36,level0_stop_writes_trigger=48,compaction_readahead_size=524288,max_bytes_for_level_base=536870912,enable_pipelined_write=false,max_background_jobs=16,max_background_flushes=8,max_background_compactions=16,max_write_buffer_number=8,soft_pending_compaction_bytes_limit=137438953472,hard_pending_compaction_bytes_limit=274877906944,delayed_write_rate=33554432
    osd_pg_object_context_cache_count = 256
    
    mon_osd_full_ratio = 0.97
    mon_osd_nearfull_ratio = 0.95
    osd_min_pg_log_entries = 10
    osd_max_pg_log_entries = 10
    
    bluestore_cache_meta_ratio = 0.49
    bluestore_cache_kv_ratio = 0.49
    bluestore_cache_size_ssd = 6G
    osd_memory_target = 10G
    bluestore_clone_cow = false
    
    [osd.0]
    osd_numa_node=0
    [osd.1]
    osd_numa_node=0
    [osd.2]
    osd_numa_node=0
    [osd.3]
    osd_numa_node=0
    [osd.4]
    osd_numa_node=0
    [osd.5]
    osd_numa_node=0
    [osd.6]
    osd_numa_node=1
    [osd.7]
    osd_numa_node=1
    [osd.8]
    osd_numa_node=1
    [osd.9]
    osd_numa_node=1
    [osd.10]
    osd_numa_node=1
    [osd.11]
    osd_numa_node=1
    [osd.12]
    osd_numa_node=2
    [osd.13]
    osd_numa_node=2
    [osd.14]
    osd_numa_node=2
    [osd.15]
    osd_numa_node=2
    [osd.16]
    osd_numa_node=2
    [osd.17]
    osd_numa_node=2
    [osd.18]
    osd_numa_node=3
    [osd.19]
    osd_numa_node=3
    [osd.20]
    osd_numa_node=3
    [osd.21]
    osd_numa_node=3
    [osd.22]
    osd_numa_node=3
    [osd.23]
    osd_numa_node=3
    [osd.24]
    osd_numa_node=0
    [osd.25]
    osd_numa_node=0
    [osd.26]
    osd_numa_node=0
    [osd.27]
    osd_numa_node=0
    [osd.28]
    osd_numa_node=0
    [osd.29]
    osd_numa_node=0
    [osd.30]
    osd_numa_node=1
    [osd.31]
    osd_numa_node=1
    [osd.32]
    osd_numa_node=1
    [osd.33]
    osd_numa_node=1
    [osd.34]
    osd_numa_node=1
    [osd.35]
    osd_numa_node=1
    [osd.36]
    osd_numa_node=2
    [osd.37]
    osd_numa_node=2
    [osd.38]
    osd_numa_node=2
    [osd.39]
    osd_numa_node=2
    [osd.40]
    osd_numa_node=2
    [osd.41]
    osd_numa_node=2
    [osd.42]
    osd_numa_node=3
    [osd.43]
    osd_numa_node=3
    [osd.44]
    osd_numa_node=3
    [osd.45]
    osd_numa_node=3
    [osd.46]
    osd_numa_node=3
    [osd.47]
    osd_numa_node=3
    
    [osd.48]
    osd_numa_node=0
    [osd.49]
    osd_numa_node=0
    [osd.50]
    osd_numa_node=0
    [osd.51]
    osd_numa_node=0
    [osd.52]
    osd_numa_node=0
    [osd.53]
    osd_numa_node=0
    [osd.54]
    osd_numa_node=1
    [osd.55]
    osd_numa_node=1
    [osd.56]
    osd_numa_node=1
    [osd.57]
    osd_numa_node=1
    [osd.58]
    osd_numa_node=1
    [osd.59]
    osd_numa_node=1
    [osd.60]
    osd_numa_node=2
    [osd.61]
    osd_numa_node=2
    [osd.62]
    osd_numa_node=2
    [osd.63]
    osd_numa_node=2
    [osd.64]
    osd_numa_node=2
    [osd.65]
    osd_numa_node=2
    [osd.66]
    osd_numa_node=3
    [osd.67]
    osd_numa_node=3
    [osd.68]
    osd_numa_node=3
    [osd.69]
    osd_numa_node=3
    [osd.70]
    osd_numa_node=3
    [osd.71]
    osd_numa_node=3
    

    In the configuration file, osd_numa_node=0 indicates that the osd.0 process is bound to NUMA node 0. This document uses the Kunpeng 920 7260 processor as an example. This model has four NUMA nodes, and 24 OSDs are deployed on each server node. To better utilize hardware performance, bind the OSD processes evenly across the NUMA nodes. In this example, every 6 consecutive OSDs are bound to one NUMA node. If the CPU in use has only two NUMA nodes, bind every 12 consecutive OSDs to a NUMA node; handle other configurations by analogy. You can run the lscpu command to check the number of CPU NUMA nodes.

  3. Synchronize the configurations to other nodes.
    ceph-deploy --overwrite-conf admin ceph1 ceph2 ceph3
    
  4. Restart OSD processes on all server nodes to validate the configurations.
    systemctl restart ceph-osd.target
    
Table 1 Parameters

Parameter

Description

Optimal Configuration

ms_bind_before_connect

Controls the timing of the binding behavior.

true

throttler_perf_counter

Collects statistics on the performance of the throttler used during OSD request processing.

false

bluefs_buffered_io

Configures buffered I/Os.

false

osd_max_write_size

Specifies the maximum data block size for OSD write operations.

256

osd_enable_op_tracker

Configures OSD monitoring and performance analysis enhancement.

false

rbd_cache

Configures the RBD cache.

false

rocksdb_cache_index_and_filter_blocks

Specifies whether to cache indexes and filters in blocks.

false

rocksdb_cache_size

Specifies the size of the RocksDB internal cache.

2 GB

osd_memory_cache_min

Specifies the minimum memory capacity used for caching when TCMalloc and automatic cache tuning are enabled.

3 GB

osd_memory_base

Specifies the estimated minimum memory required by OSDs when TCMalloc and automatic cache tuning are enabled.

3 GB

bluestore_rocksdb_options

Configures RocksDB options.

use_direct_reads=true,use_direct_io_for_flush_and_compaction=true,compression=kNoCompression,min_write_buffer_number_to_merge=32,recycle_log_file_num=64,compaction_style=kCompactionStyleLevel,write_buffer_size=64M,target_file_size_base=64M,compaction_threads=32,max_bytes_for_level_multiplier=8,flusher_threads=8,level0_file_num_compaction_trigger=16,level0_slowdown_writes_trigger=36,level0_stop_writes_trigger=48,compaction_readahead_size=524288,max_bytes_for_level_base=536870912,enable_pipelined_write=false,max_background_jobs=16,max_background_flushes=8,max_background_compactions=16,max_write_buffer_number=8,soft_pending_compaction_bytes_limit=137438953472,hard_pending_compaction_bytes_limit=274877906944,delayed_write_rate=33554432

osd_pg_object_context_cache_count

Specifies the number of cache entries of OSD object contexts.

256

mon_osd_full_ratio

Specifies the OSD full ratio. When the used cluster capacity reaches the specified ratio, the cluster stops data writing (but allows data reading), and enters the HEALTH_ERR state.

0.97

mon_osd_nearfull_ratio

Specifies the OSD nearfull ratio. When the used cluster capacity reaches the specified ratio, the cluster enters the HEALTH_WARN state.

0.95

osd_min_pg_log_entries

Specifies the minimum number of entries maintained in a PG log.

10

osd_max_pg_log_entries

Specifies the maximum number of entries maintained in a PG log.

10

bluestore_cache_meta_ratio

Specifies the ratio of cache allocated to metadata.

0.49

bluestore_cache_kv_ratio

Specifies the ratio of cache allocated to RocksDB's key-value store.

0.49

bluestore_cache_size_ssd

Specifies the BlueStore cache capacity.

6 GB

osd_memory_target

Specifies the maximum memory usage of an OSD.

10 GB

bluestore_clone_cow

Controls the behavior of copy-on-write (CoW) cloning. In CoW mode, the write performance is poor. The redirect-on-write (RoW) mode is recommended.

false