Hi,

We have a CephFS Quincy cluster (17.2.7) used by OpenShift for PVC provisioning (with a lot of snapshots) through the ceph-csi driver. On the metadata pool, we observe a continuous increase in write throughput, from about 10 MB/s just after an MDS restart to more than 200 MB/s after 2 weeks.

We first thought it was related to snapshot deletion activity (there are a lot of snapshot creations and deletions every day), because we saw the same kind of increase in the number of strays. We tried to evaluate strays using the recursive scrub [1]; the number of strays decreased from 450K to 350K, but this had no impact on the metadata write throughput.
Only an MDS restart brings the write throughput back to normal.

I saw this very similar issue, https://tracker.ceph.com/issues/53542, and I'm wondering whether it has been fixed in Quincy. I tried increasing "mds_log_events_per_segment" and "mds_log_max_segments", but neither helped. As I commented in the tracker, I also observed large 4M objects in objecter_requests.

Is there anything we can do to prevent this and avoid restarting the MDS every week? If we don't restart it, PVC operations stay blocked on the OpenShift side.

Cheers,
Adrien

[1] https://docs.ceph.com/en/quincy/cephfs/scrub/#evaluate-strays-using-recursive-scrub

Here is the MDS perf dump, in case it helps:

{
    "AsyncMessenger::Worker-0": {
        "msgr_recv_messages": 452889868,
        "msgr_send_messages": 210932005,
        "msgr_recv_bytes": 381554341884,
        "msgr_send_bytes": 31897539519042,
        "msgr_created_connections": 5392,
        "msgr_active_connections": 213,
        "msgr_running_total_time": 121964.941943159,
        "msgr_running_send_time": 85193.342708858,
        "msgr_running_recv_time": 40274.249544605,
        "msgr_running_fast_dispatch_time": 15899.702855460,
        "msgr_send_messages_queue_lat": {
            "avgcount": 210932003,
            "sum": 140532.121408536,
            "avgtime": 0.000666243
        },
        "msgr_handle_ack_lat": {
            "avgcount": 110313532,
            "sum": 120.513184629,
            "avgtime": 0.000001092
        }
    },
    "AsyncMessenger::Worker-1": {
        "msgr_recv_messages": 393334601,
        "msgr_send_messages": 170841678,
        "msgr_recv_bytes": 307649876710,
        "msgr_send_bytes": 23017109865662,
        "msgr_created_connections": 12712,
        "msgr_active_connections": 218,
        "msgr_running_total_time": 93136.265793790,
        "msgr_running_send_time": 63604.143090523,
        "msgr_running_recv_time": 30881.136884349,
        "msgr_running_fast_dispatch_time": 12763.126679902,
        "msgr_send_messages_queue_lat": {
            "avgcount": 170841678,
            "sum": 153574.265735747,
            "avgtime": 0.000898927
        },
        "msgr_handle_ack_lat": {
            "avgcount": 81594240,
            "sum": 65.480936528,
            "avgtime": 0.000000802
        }
    },
    "AsyncMessenger::Worker-2": {
        "msgr_recv_messages": 481956104,
        "msgr_send_messages": 301348856,
        "msgr_recv_bytes": 349838769013,
        "msgr_send_bytes": 26783354654792,
        "msgr_created_connections": 34394,
        "msgr_active_connections": 215,
        "msgr_running_total_time": 108807.807447203,
        "msgr_running_send_time": 74577.666991790,
        "msgr_running_recv_time": 41392.667149426,
        "msgr_running_fast_dispatch_time": 13971.142479134,
        "msgr_send_messages_queue_lat": {
            "avgcount": 301348847,
            "sum": 138655.862052108,
            "avgtime": 0.000460117
        },
        "msgr_handle_ack_lat": {
            "avgcount": 156375610,
            "sum": 144.168097813,
            "avgtime": 0.000000921
        }
    },
    "cct": {
        "total_workers": 1,
        "unhealthy_workers": 0
    },
    "finisher-MDSRank": {
        "queue_len": 0,
        "complete_latency": {
            "avgcount": 121564865,
            "sum": 160601.142778677,
            "avgtime": 0.001321114
        }
    },
    "finisher-PurgeQueue": {
        "queue_len": 0,
        "complete_latency": {
            "avgcount": 27338289,
            "sum": 8785.146872773,
            "avgtime": 0.000321349
        }
    },
    "mds": {
        "request": 210298249,
        "reply": 210297956,
        "reply_latency": {
            "avgcount": 210297956,
            "sum": 1148337.019419176,
            "avgtime": 0.005460523
        },
        "slow_reply": 15,
        "forward": 0,
        "dir_fetch": 49843551,
        "dir_commit": 65281430,
        "dir_split": 6,
        "dir_merge": 17,
        "inodes": 5113817,
        "inodes_top": 2080514,
        "inodes_bottom": 2263929,
        "inodes_pin_tail": 769374,
        "inodes_pinned": 2141653,
        "inodes_expired": 538454667,
        "inodes_with_caps": 413837,
        "caps": 457168,
        "subtrees": 2,
        "traverse": 262113637,
        "traverse_hit": 245551660,
        "traverse_forward": 0,
        "traverse_discover": 0,
        "traverse_dir_fetch": 1642726,
        "traverse_remote_ino": 0,
        "traverse_lock": 140893,
        "load_cent": 392174,
        "q": 0,
        "exported": 0,
        "exported_inodes": 0,
        "imported": 0,
        "imported_inodes": 0,
        "openino_dir_fetch": 407633,
        "openino_backtrace_fetch": 7011755,
        "openino_peer_discover": 0,
        "root_rfiles": 11618539,
        "root_rbytes": 11106041675168,
        "root_rsnaps": 7576,
        "scrub_backtrace_fetch": 7009788,
        "scrub_set_tag": 0,
        "scrub_backtrace_repaired": 0,
        "scrub_inotable_repaired": 0,
        "scrub_dir_inodes": 1526050,
        "scrub_dir_base_inodes": 3,
        "scrub_dirfrag_rstats": 1526047,
        "scrub_file_inodes": 5483733,
        "handle_inode_file_caps": 0,
        "ceph_cap_op_revoke": 5076186,
        "ceph_cap_op_grant": 60192925,
        "ceph_cap_op_trunc": 658164,
        "ceph_cap_op_flushsnap_ack": 0,
        "ceph_cap_op_flush_ack": 6,
        "handle_client_caps": 70349707,
        "handle_client_caps_dirty": 21547109,
        "handle_client_cap_release": 11468771,
        "process_request_cap_release": 83631146
    },
    "mds_cache": {
        "num_strays": 347989,
        "num_strays_delayed": 0,
        "num_strays_enqueuing": 0,
        "strays_created": 19259955,
        "strays_enqueued": 20275628,
        "strays_reintegrated": 469,
        "strays_migrated": 0,
        "num_recovering_processing": 0,
        "num_recovering_enqueued": 0,
        "num_recovering_prioritized": 0,
        "recovery_started": 484,
        "recovery_completed": 484,
        "ireq_enqueue_scrub": 2,
        "ireq_exportdir": 0,
        "ireq_flush": 0,
        "ireq_fragmentdir": 23,
        "ireq_fragstats": 0,
        "ireq_inodestats": 0
    },
    "mds_log": {
        "evadd": 148604638,
        "evex": 148687537,
        "evtrm": 148686957,
        "ev": 4640,
        "evexg": 0,
        "evexd": 580,
        "segadd": 5143165,
        "segex": 5143142,
        "segtrm": 5143128,
        "seg": 166,
        "segexg": 0,
        "segexd": 14,
        "expos": 2655552712972056,
        "wrpos": 2655556750517207,
        "rdpos": 2574717722176006,
        "jlat": {
            "avgcount": 23948614,
            "sum": 1026045.477642289,
            "avgtime": 0.042843626
        },
        "replayed": 86959
    },
    "mds_mem": {
        "ino": 5091233,
        "ino+": 537662754,
        "ino-": 532571521,
        "dir": 385511,
        "dir+": 48473070,
        "dir-": 48087559,
        "dn": 5114534,
        "dn+": 564239550,
        "dn-": 559125016,
        "cap": 457198,
        "cap+": 296065469,
        "cap-": 295608271,
        "rss": 21947020,
        "heap": 223516
    },
    "mds_server": {
        "dispatch_client_request": 265192774,
        "dispatch_server_request": 0,
        "handle_client_request": 210298249,
        "handle_client_session": 44456292,
        "handle_peer_request": 0,
        "req_create_latency": {
            "avgcount": 19045215,
            "sum": 243874.341572325,
            "avgtime": 0.012805019
        },
        "req_getattr_latency": {
            "avgcount": 13244393,
            "sum": 122372.637049784,
            "avgtime": 0.009239580
        },
        "req_getfilelock_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_link_latency": {
            "avgcount": 108,
            "sum": 0.089209875,
            "avgtime": 0.000826017
        },
        "req_lookup_latency": {
            "avgcount": 42591076,
            "sum": 61153.611919024,
            "avgtime": 0.001435831
        },
        "req_lookuphash_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_lookupino_latency": {
            "avgcount": 6119,
            "sum": 12.051986148,
            "avgtime": 0.001969600
        },
        "req_lookupname_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_lookupparent_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_lookupsnap_latency": {
            "avgcount": 11789,
            "sum": 1.916955778,
            "avgtime": 0.000162605
        },
        "req_lssnap_latency": {
            "avgcount": 174999,
            "sum": 59.628852495,
            "avgtime": 0.000340738
        },
        "req_mkdir_latency": {
            "avgcount": 386018,
            "sum": 5323.027974258,
            "avgtime": 0.013789584
        },
        "req_mknod_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_mksnap_latency": {
            "avgcount": 11784,
            "sum": 4342.774640033,
            "avgtime": 0.368531452
        },
        "req_open_latency": {
            "avgcount": 1719118,
            "sum": 2957.961208770,
            "avgtime": 0.001720627
        },
        "req_readdir_latency": {
            "avgcount": 49303877,
            "sum": 153658.488737674,
            "avgtime": 0.003116559
        },
        "req_rename_latency": {
            "avgcount": 1021332,
            "sum": 4173.200914205,
            "avgtime": 0.004086037
        },
        "req_renamesnap_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_rmdir_latency": {
            "avgcount": 355348,
            "sum": 6609.629921457,
            "avgtime": 0.018600442
        },
        "req_rmsnap_latency": {
            "avgcount": 11476,
            "sum": 4647.171801363,
            "avgtime": 0.404947002
        },
        "req_rmxattr_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_setattr_latency": {
            "avgcount": 951092,
            "sum": 60918.695526648,
            "avgtime": 0.064051317
        },
        "req_setdirlayout_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_setfilelock_latency": {
            "avgcount": 42236246,
            "sum": 87407.285106607,
            "avgtime": 0.002069485
        },
        "req_setlayout_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        },
        "req_setxattr_latency": {
            "avgcount": 20450454,
            "sum": 351779.625311317,
            "avgtime": 0.017201555
        },
        "req_symlink_latency": {
            "avgcount": 2,
            "sum": 0.002286490,
            "avgtime": 0.001143245
        },
        "req_unlink_latency": {
            "avgcount": 18777510,
            "sum": 39044.878444925,
            "avgtime": 0.002079342
        },
        "cap_revoke_eviction": 0,
        "cap_acquisition_throttle": 0,
        "req_getvxattr_latency": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "mds_sessions": {
        "session_count": 545,
        "session_add": 26598,
        "session_remove": 26053,
        "sessions_open": 545,
        "sessions_stale": 0,
        "total_load": 11076,
        "average_load": 20,
        "avg_session_uptime": 37409721,
        "mdthresh_evicted": 0
    },
    "mempool": {
        "bloom_filter_bytes": 10309112,
        "bloom_filter_items": 10309112,
        "bluestore_alloc_bytes": 0,
        "bluestore_alloc_items": 0,
        "bluestore_cache_data_bytes": 0,
        "bluestore_cache_data_items": 0,
        "bluestore_cache_onode_bytes": 0,
        "bluestore_cache_onode_items": 0,
        "bluestore_cache_meta_bytes": 0,
        "bluestore_cache_meta_items": 0,
        "bluestore_cache_other_bytes": 0,
        "bluestore_cache_other_items": 0,
        "bluestore_Buffer_bytes": 0,
        "bluestore_Buffer_items": 0,
        "bluestore_Extent_bytes": 0,
        "bluestore_Extent_items": 0,
        "bluestore_Blob_bytes": 0,
        "bluestore_Blob_items": 0,
        "bluestore_SharedBlob_bytes": 0,
        "bluestore_SharedBlob_items": 0,
        "bluestore_inline_bl_bytes": 0,
        "bluestore_inline_bl_items": 0,
        "bluestore_fsck_bytes": 0,
        "bluestore_fsck_items": 0,
        "bluestore_txc_bytes": 0,
        "bluestore_txc_items": 0,
        "bluestore_writing_deferred_bytes": 0,
        "bluestore_writing_deferred_items": 0,
        "bluestore_writing_bytes": 0,
        "bluestore_writing_items": 0,
        "bluefs_bytes": 0,
        "bluefs_items": 0,
        "bluefs_file_reader_bytes": 0,
        "bluefs_file_reader_items": 0,
        "bluefs_file_writer_bytes": 0,
        "bluefs_file_writer_items": 0,
        "buffer_anon_bytes": 284731060,
        "buffer_anon_items": 5959770,
        "buffer_meta_bytes": 616,
        "buffer_meta_items": 7,
        "osd_bytes": 0,
        "osd_items": 0,
        "osd_mapbl_bytes": 0,
        "osd_mapbl_items": 0,
        "osd_pglog_bytes": 0,
        "osd_pglog_items": 0,
        "osdmap_bytes": 81480,
        "osdmap_items": 2396,
        "osdmap_mapping_bytes": 0,
        "osdmap_mapping_items": 0,
        "pgmap_bytes": 0,
        "pgmap_items": 0,
        "mds_co_bytes": 16277298996,
        "mds_co_items": 260156681,
        "unittest_1_bytes": 0,
        "unittest_1_items": 0,
        "unittest_2_bytes": 0,
        "unittest_2_items": 0
    },
    "objecter": {
        "op_active": 11,
        "op_laggy": 0,
        "op_send": 253406063,
        "op_send_bytes": 81014361089067,
        "op_resend": 0,
        "op_reply": 253406052,
        "oplen_avg": {
            "avgcount": 253406063,
            "sum": 451959672
        },
        "op": 253406063,
        "op_r": 58269533,
        "op_w": 195136530,
        "op_rmw": 0,
        "op_pg": 0,
        "osdop_stat": 64974069,
        "osdop_create": 11505623,
        "osdop_read": 1275145,
        "osdop_write": 34882891,
        "osdop_writefull": 1169760,
        "osdop_writesame": 0,
        "osdop_append": 0,
        "osdop_zero": 2,
        "osdop_truncate": 4,
        "osdop_delete": 79836916,
        "osdop_mapext": 0,
        "osdop_sparse_read": 0,
        "osdop_clonerange": 0,
        "osdop_getxattr": 56700621,
        "osdop_setxattr": 17474636,
        "osdop_cmpxattr": 0,
        "osdop_rmxattr": 0,
        "osdop_resetxattrs": 0,
        "osdop_call": 0,
        "osdop_watch": 0,
        "osdop_notify": 0,
        "osdop_src_cmpxattr": 0,
        "osdop_pgls": 0,
        "osdop_pgls_filter": 0,
        "osdop_other": 683839,
        "linger_active": 0,
        "linger_send": 0,
        "linger_resend": 0,
        "linger_ping": 0,
        "poolop_active": 0,
        "poolop_send": 0,
        "poolop_resend": 0,
        "poolstat_active": 0,
        "poolstat_send": 0,
        "poolstat_resend": 0,
        "statfs_active": 0,
        "statfs_send": 0,
        "statfs_resend": 0,
        "command_active": 0,
        "command_send": 0,
        "command_resend": 0,
        "map_epoch": 685274,
        "map_full": 0,
        "map_inc": 19880,
        "osd_sessions": 96,
        "osd_session_open": 96,
        "osd_session_close": 0,
        "osd_laggy": 0,
        "omap_wr": 78737471,
        "omap_rd": 99825478,
        "omap_del": 4893217
    },
    "oft": {
        "omap_total_objs": 4,
        "omap_total_kv_pairs": 105334,
        "omap_total_updates": 80286291,
        "omap_total_removes": 72648222
    },
    "purge_queue": {
        "pq_executing_ops": 0,
        "pq_executing_ops_high_water": 1559,
        "pq_executing": 0,
        "pq_executing_high_water": 64,
        "pq_executed": 20275628,
        "pq_item_in_journal": 0
    },
    "throttle-msgr_dispatch_throttler-mds": {
        "val": 0,
        "max": 104857600,
        "get_started": 0,
        "get": 1328180571,
        "get_sum": 935681276002,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 1328180571,
        "take": 0,
        "take_sum": 0,
        "put": 1328180571,
        "put_sum": 935681276002,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-objecter_bytes": {
        "val": 21734389,
        "max": 104857600,
        "get_started": 0,
        "get": 0,
        "get_sum": 0,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 0,
        "take": 253406063,
        "take_sum": 81151894362586,
        "put": 253406052,
        "put_sum": 81151872628197,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-objecter_ops": {
        "val": 11,
        "max": 1024,
        "get_started": 0,
        "get": 0,
        "get_sum": 0,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 0,
        "take": 253406063,
        "take_sum": 253406063,
        "put": 253406052,
        "put_sum": 253406052,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-write_buf_throttle": {
        "val": 0,
        "max": 3758096384,
        "get_started": 0,
        "get": 20275628,
        "get_sum": 2230259884,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 20275628,
        "take": 0,
        "take_sum": 0,
        "put": 1242717,
        "put_sum": 2230259884,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-write_buf_throttle-0x56401344c0a0": {
        "val": 0,
        "max": 3758096384,
        "get_started": 0,
        "get": 148604638,
        "get_sum": 80839025768070,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 148604638,
        "take": 0,
        "take_sum": 0,
        "put": 23948617,
        "put_sum": 80839025768070,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    }
}
_______________________________________________
ceph-users mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to