From 1384c2df90f1b581942ef1c5d58e675058e7dc6a Mon Sep 17 00:00:00 2001
From: Magnus Andersson
Date: Fri, 25 Oct 2024 13:18:48 +0200
Subject: [PATCH] Updated rook deployment for failure zones

---
 k8s/rook/cluster-multizone.yaml       |  83 +++++++
 k8s/rook/cluster.yaml                 | 319 --------------------------
 k8s/rook/modules-load.d/rookceph.conf |   3 +
 tools/k8sconnectivitytest.sh          |   1 +
 4 files changed, 87 insertions(+), 319 deletions(-)
 create mode 100644 k8s/rook/cluster-multizone.yaml
 delete mode 100644 k8s/rook/cluster.yaml
 create mode 100644 k8s/rook/modules-load.d/rookceph.conf

diff --git a/k8s/rook/cluster-multizone.yaml b/k8s/rook/cluster-multizone.yaml
new file mode 100644
index 0000000..ba50479
--- /dev/null
+++ b/k8s/rook/cluster-multizone.yaml
@@ -0,0 +1,83 @@
+#################################################################################################################
+# Define the settings for the rook-ceph cluster with common settings for a production cluster.
+# Selected nodes with selected raw devices will be used for the Ceph cluster. At least three nodes are required
+# in this example. See the documentation for more details on storage settings available.
+
+# For example, to create the cluster:
+# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
+# kubectl create -f cluster-multizone.yaml
+#################################################################################################################
+
+apiVersion: ceph.rook.io/v1
+kind: CephCluster
+metadata:
+  name: rook-ceph
+  namespace: rook-ceph # namespace:cluster
+spec:
+  dataDirHostPath: /var/lib/rook
+  mon:
+    count: 3
+    allowMultiplePerNode: false
+    failureDomainLabel: topology.kubernetes.io/zone
+    zones:
+      - name: dco
+      - name: sto3
+      - name: sto4
+  mgr:
+    count: 2
+    modules:
+      - name: pg_autoscaler
+        enabled: true
+  cephVersion:
+    image: quay.io/ceph/ceph:v18.2.4
+    allowUnsupported: true
+    skipUpgradeChecks: false
+    continueUpgradeAfterChecksEvenIfNotHealthy: false
+  dashboard:
+    enabled: true
+    ssl: true
+  storage:
+    useAllNodes: false
+    nodes:
+      - name: k8sw1
+      - name: k8sw2
+      - name: k8sw3
+      - name: k8sw4
+      - name: k8sw5
+      - name: k8sw6
+    useAllDevices: false
+    devices:
+      - name: "/dev/rookvg/rookvol1"
+      - name: "/dev/rookvg/rookvol2"
+      - name: "/dev/rookvg/rookvol3"
+    deviceFilter: ""
+  placement:
+    osd:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+          - matchExpressions:
+            - key: topology.kubernetes.io/zone
+              operator: In
+              values:
+              - dco
+              - sto3
+              - sto4
+
+    mgr:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+          - matchExpressions:
+            - key: topology.kubernetes.io/zone
+              operator: In
+              values:
+              - dco
+              - sto3
+              - sto4
+  priorityClassNames:
+    mon: system-node-critical
+    osd: system-node-critical
+    mgr: system-cluster-critical
+  disruptionManagement:
+    managePodBudgets: true
diff --git a/k8s/rook/cluster.yaml b/k8s/rook/cluster.yaml
deleted file mode 100644
index d17e3a3..0000000
--- a/k8s/rook/cluster.yaml
+++ /dev/null
@@ -1,319 +0,0 @@
-#################################################################################################################
-# Define the settings for the rook-ceph cluster with common settings for a production cluster.
-# All nodes with available raw devices will be used for the Ceph cluster. At least three nodes are required
-# in this example. See the documentation for more details on storage settings available.
- -# For example, to create the cluster: -# kubectl create -f crds.yaml -f common.yaml -f operator.yaml -# kubectl create -f cluster.yaml -################################################################################################################# - -apiVersion: ceph.rook.io/v1 -kind: CephCluster -metadata: - name: rook-ceph - namespace: rook-ceph # namespace:cluster -spec: - cephVersion: - # The container image used to launch the Ceph daemon pods (mon, mgr, osd, mds, rgw). - # v16 is Pacific, and v17 is Quincy. - # RECOMMENDATION: In production, use a specific version tag instead of the general v17 flag, which pulls the latest release and could result in different - # versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/. - # If you want to be more precise, you can always use a timestamp tag such quay.io/ceph/ceph:v17.2.6-20230410 - # This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities - image: quay.io/ceph/ceph:v17.2.6 - # Whether to allow unsupported versions of Ceph. Currently `pacific` and `quincy` are supported. - # Future versions such as `reef` (v18) would require this to be set to `true`. - # Do not set to true in production. - allowUnsupported: false - # The path on the host where configuration files will be persisted. Must be specified. - # Important: if you reinstall the cluster, make sure you delete this directory from each host or else the mons will fail to start on the new cluster. - # In Minikube, the '/data' directory is configured to persist across reboots. Use "/data/rook" in Minikube environment. - dataDirHostPath: /var/lib/rook - # Whether or not upgrade should continue even if a check fails - # This means Ceph's status could be degraded and we don't recommend upgrading but you might decide otherwise - # Use at your OWN risk - # To understand Rook's upgrade process of Ceph, read https://rook.io/docs/rook/latest/ceph-upgrade.html#ceph-version-upgrades - skipUpgradeChecks: false - # Whether or not continue if PGs are not clean during an upgrade - continueUpgradeAfterChecksEvenIfNotHealthy: false - # WaitTimeoutForHealthyOSDInMinutes defines the time (in minutes) the operator would wait before an OSD can be stopped for upgrade or restart. - # If the timeout exceeds and OSD is not ok to stop, then the operator would skip upgrade for the current OSD and proceed with the next one - # if `continueUpgradeAfterChecksEvenIfNotHealthy` is `false`. If `continueUpgradeAfterChecksEvenIfNotHealthy` is `true`, then operator would - # continue with the upgrade of an OSD even if its not ok to stop after the timeout. This timeout won't be applied if `skipUpgradeChecks` is `true`. - # The default wait timeout is 10 minutes. - waitTimeoutForHealthyOSDInMinutes: 10 - mon: - # Set the number of mons to be started. Generally recommended to be 3. - # For highest availability, an odd number of mons should be specified. - count: 3 - # The mons should be on unique nodes. For production, at least 3 nodes are recommended for this reason. - # Mons should only be allowed on the same node for test environments where data loss is acceptable. - allowMultiplePerNode: false - mgr: - # When higher availability of the mgr is needed, increase the count to 2. - # In that case, one mgr will be active and one in standby. When Ceph updates which - # mgr is active, Rook will update the mgr services to match the active mgr. 
- count: 2 - allowMultiplePerNode: false - modules: - # Several modules should not need to be included in this list. The "dashboard" and "monitoring" modules - # are already enabled by other settings in the cluster CR. - - name: pg_autoscaler - enabled: true - # enable the ceph dashboard for viewing cluster status - dashboard: - enabled: true - # serve the dashboard under a subpath (useful when you are accessing the dashboard via a reverse proxy) - # urlPrefix: /ceph-dashboard - # serve the dashboard at the given port. - # port: 8443 - # serve the dashboard using SSL - ssl: true - # enable prometheus alerting for cluster - monitoring: - # requires Prometheus to be pre-installed - enabled: false - # Whether to disable the metrics reported by Ceph. If false, the prometheus mgr module and Ceph exporter are enabled. - # If true, the prometheus mgr module and Ceph exporter are both disabled. Default is false. - metricsDisabled: false - network: - connections: - # Whether to encrypt the data in transit across the wire to prevent eavesdropping the data on the network. - # The default is false. When encryption is enabled, all communication between clients and Ceph daemons, or between Ceph daemons will be encrypted. - # When encryption is not enabled, clients still establish a strong initial authentication and data integrity is still validated with a crc check. - # IMPORTANT: Encryption requires the 5.11 kernel for the latest nbd and cephfs drivers. Alternatively for testing only, - # you can set the "mounter: rbd-nbd" in the rbd storage class, or "mounter: fuse" in the cephfs storage class. - # The nbd and fuse drivers are *not* recommended in production since restarting the csi driver pod will disconnect the volumes. - encryption: - enabled: false - # Whether to compress the data in transit across the wire. The default is false. - # Requires Ceph Quincy (v17) or newer. Also see the kernel requirements above for encryption. - compression: - enabled: false - # Whether to require communication over msgr2. If true, the msgr v1 port (6789) will be disabled - # and clients will be required to connect to the Ceph cluster with the v2 port (3300). - # Requires a kernel that supports msgr v2 (kernel 5.11 or CentOS 8.4 or newer). - requireMsgr2: false - # enable host networking - #provider: host - # enable the Multus network provider - #provider: multus - #selectors: - # The selector keys are required to be `public` and `cluster`. - # Based on the configuration, the operator will do the following: - # 1. if only the `public` selector key is specified both public_network and cluster_network Ceph settings will listen on that interface - # 2. if both `public` and `cluster` selector keys are specified the first one will point to 'public_network' flag and the second one to 'cluster_network' - # - # In order to work, each selector value must match a NetworkAttachmentDefinition object in Multus - # - #public: public-conf --> NetworkAttachmentDefinition object name in Multus - #cluster: cluster-conf --> NetworkAttachmentDefinition object name in Multus - # Provide internet protocol version. IPv6, IPv4 or empty string are valid options. Empty string would mean IPv4 - #ipFamily: "IPv6" - # Ceph daemons to listen on both IPv4 and Ipv6 networks - #dualStack: false - # Enable multiClusterService to export the mon and OSD services to peer cluster. - # This is useful to support RBD mirroring between two clusters having overlapping CIDRs. 
- # Ensure that peer clusters are connected using an MCS API compatible application, like Globalnet Submariner. - #multiClusterService: - # enabled: false - - # enable the crash collector for ceph daemon crash collection - crashCollector: - disable: false - # Uncomment daysToRetain to prune ceph crash entries older than the - # specified number of days. - #daysToRetain: 30 - # enable log collector, daemons will log on files and rotate - logCollector: - enabled: true - periodicity: daily # one of: hourly, daily, weekly, monthly - maxLogSize: 500M # SUFFIX may be 'M' or 'G'. Must be at least 1M. - # automate [data cleanup process](https://github.com/rook/rook/blob/master/Documentation/Storage-Configuration/ceph-teardown.md#delete-the-data-on-hosts) in cluster destruction. - cleanupPolicy: - # Since cluster cleanup is destructive to data, confirmation is required. - # To destroy all Rook data on hosts during uninstall, confirmation must be set to "yes-really-destroy-data". - # This value should only be set when the cluster is about to be deleted. After the confirmation is set, - # Rook will immediately stop configuring the cluster and only wait for the delete command. - # If the empty string is set, Rook will not destroy any data on hosts during uninstall. - confirmation: "" - # sanitizeDisks represents settings for sanitizing OSD disks on cluster deletion - sanitizeDisks: - # method indicates if the entire disk should be sanitized or simply ceph's metadata - # in both case, re-install is possible - # possible choices are 'complete' or 'quick' (default) - method: quick - # dataSource indicate where to get random bytes from to write on the disk - # possible choices are 'zero' (default) or 'random' - # using random sources will consume entropy from the system and will take much more time then the zero source - dataSource: zero - # iteration overwrite N times instead of the default (1) - # takes an integer value - iteration: 1 - # allowUninstallWithVolumes defines how the uninstall should be performed - # If set to true, cephCluster deletion does not wait for the PVs to be deleted. - allowUninstallWithVolumes: false - # To control where various services will be scheduled by kubernetes, use the placement configuration sections below. - # The example under 'all' would have all services scheduled on kubernetes nodes labeled with 'role=storage-node' and - # tolerate taints with a key of 'storage-node'. - # placement: - # all: - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: role - # operator: In - # values: - # - storage-node - # podAffinity: - # podAntiAffinity: - # topologySpreadConstraints: - # tolerations: - # - key: storage-node - # operator: Exists - # The above placement information can also be specified for mon, osd, and mgr components - # mon: - # Monitor deployments may contain an anti-affinity rule for avoiding monitor - # collocation on the same node. This is a required rule when host network is used - # or when AllowMultiplePerNode is false. Otherwise this anti-affinity rule is a - # preferred rule with weight: 50. - # osd: - # prepareosd: - # mgr: - # cleanup: - annotations: - # all: - # mon: - # osd: - # cleanup: - # prepareosd: - # clusterMetadata annotations will be applied to only `rook-ceph-mon-endpoints` configmap and the `rook-ceph-mon` and `rook-ceph-admin-keyring` secrets. - # And clusterMetadata annotations will not be merged with `all` annotations. 
- # clusterMetadata: - # kubed.appscode.com/sync: "true" - # If no mgr annotations are set, prometheus scrape annotations will be set by default. - # mgr: - labels: - # all: - # mon: - # osd: - # cleanup: - # mgr: - # prepareosd: - # monitoring is a list of key-value pairs. It is injected into all the monitoring resources created by operator. - # These labels can be passed as LabelSelector to Prometheus - # monitoring: - # crashcollector: - resources: - #The requests and limits set here, allow the mgr pod to use half of one CPU core and 1 gigabyte of memory - # mgr: - # limits: - # cpu: "500m" - # memory: "1024Mi" - # requests: - # cpu: "500m" - # memory: "1024Mi" - # The above example requests/limits can also be added to the other components - # mon: - # osd: - # For OSD it also is a possible to specify requests/limits based on device class - # osd-hdd: - # osd-ssd: - # osd-nvme: - # prepareosd: - # mgr-sidecar: - # crashcollector: - # logcollector: - # cleanup: - # exporter: - # The option to automatically remove OSDs that are out and are safe to destroy. - removeOSDsIfOutAndSafeToRemove: false - priorityClassNames: - #all: rook-ceph-default-priority-class - mon: system-node-critical - osd: system-node-critical - mgr: system-cluster-critical - #crashcollector: rook-ceph-crashcollector-priority-class - storage: # cluster level storage configuration and selection - useAllNodes: false - nodes: - - name: k8sw1 - - name: k8sw2 - - name: k8sw3 - - name: k8sw4 - - name: k8sw5 - - name: k8sw6 - - useAllDevices: false - - devices: - - name: "/dev/rookvg/rookvol1" - - name: "/dev/rookvg/rookvol2" - - name: "/dev/rookvg/rookvol3" - config: - # crushRoot: "custom-root" # specify a non-default root label for the CRUSH map - # metadataDevice: "md0" # specify a non-rotational storage so ceph-volume will use it as block db device of bluestore. - # databaseSizeMB: "1024" # uncomment if the disks are smaller than 100 GB - # journalSizeMB: "1024" # uncomment if the disks are 20 GB or smaller - # osdsPerDevice: "1" # this value can be overridden at the node or device level - # encryptedDevice: "true" # the default value for this option is "false" - # Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named - # nodes below will be used as storage resources. Each node's 'name' field should match their 'kubernetes.io/hostname' label. - # nodes: - # - name: "172.17.4.201" - # devices: # specific devices to use for storage can be specified for each node - # - name: "sdb" - # - name: "nvme01" # multiple osds can be created on high performance devices - # config: - # osdsPerDevice: "5" - # - name: "/dev/disk/by-id/ata-ST4000DM004-XXXX" # devices can be specified using full udev paths - # config: # configuration can be specified at the node level which overrides the cluster level config - # - name: "172.17.4.301" - # deviceFilter: "^sd." - # when onlyApplyOSDPlacement is false, will merge both placement.All() and placement.osd - onlyApplyOSDPlacement: false - # The section for configuring management of daemon disruptions during upgrade or fencing. - disruptionManagement: - # If true, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically - # via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph/ceph-managed-disruptionbudgets.md). The operator will - # block eviction of OSDs by default and unblock them safely when drains are detected. 
- managePodBudgets: true - # A duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the - # default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes. - osdMaintenanceTimeout: 30 - # A duration in minutes that the operator will wait for the placement groups to become healthy (active+clean) after a drain was completed and OSDs came back up. - # Operator will continue with the next drain if the timeout exceeds. It only works if `managePodBudgets` is `true`. - # No values or 0 means that the operator will wait until the placement groups are healthy before unblocking the next drain. - pgHealthCheckTimeout: 0 - - # healthChecks - # Valid values for daemons are 'mon', 'osd', 'status' - healthCheck: - daemonHealth: - mon: - disabled: false - interval: 45s - osd: - disabled: false - interval: 60s - status: - disabled: false - interval: 60s - # Change pod liveness probe timing or threshold values. Works for all mon,mgr,osd daemons. - livenessProbe: - mon: - disabled: false - mgr: - disabled: false - osd: - disabled: false - # Change pod startup probe timing or threshold values. Works for all mon,mgr,osd daemons. - startupProbe: - mon: - disabled: false - mgr: - disabled: false - osd: - disabled: false diff --git a/k8s/rook/modules-load.d/rookceph.conf b/k8s/rook/modules-load.d/rookceph.conf new file mode 100644 index 0000000..7518225 --- /dev/null +++ b/k8s/rook/modules-load.d/rookceph.conf @@ -0,0 +1,3 @@ +nbd +rbd +ceph diff --git a/tools/k8sconnectivitytest.sh b/tools/k8sconnectivitytest.sh index d9c73a4..80da43c 100755 --- a/tools/k8sconnectivitytest.sh +++ b/tools/k8sconnectivitytest.sh @@ -19,6 +19,7 @@ portsc['10257']='tcp' portsc['10259']='tcp' portsc['12379']='tcp' portsc['16443']='tcp' +portsc['19001']='tcp' portsc['25000']='tcp' portsc['51820']='udp'
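
The multizone spec in cluster-multizone.yaml keys the mon zones and the OSD/mgr placement off the topology.kubernetes.io/zone node label, so each storage node must carry that label with one of the values dco, sto3 or sto4 before the cluster is created. A minimal sketch of the labelling and a follow-up check, assuming a node-to-zone mapping of k8sw1/k8sw2 -> dco, k8sw3/k8sw4 -> sto3 and k8sw5/k8sw6 -> sto4 (the real mapping is not stated in the patch):

# Label the storage nodes with their failure zone (hypothetical mapping; adjust to the actual topology).
kubectl label nodes k8sw1 k8sw2 topology.kubernetes.io/zone=dco --overwrite
kubectl label nodes k8sw3 k8sw4 topology.kubernetes.io/zone=sto3 --overwrite
kubectl label nodes k8sw5 k8sw6 topology.kubernetes.io/zone=sto4 --overwrite

# Verify the labels before running kubectl create -f cluster-multizone.yaml.
kubectl get nodes -L topology.kubernetes.io/zone

# Once the OSDs are up, the CRUSH tree should show one zone bucket per label value.
# This assumes the standard Rook toolbox deployment (rook-ceph-tools) is installed.
kubectl -n rook-ceph exec deploy/rook-ceph-tools -- ceph osd tree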
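
The new modules-load.d/rookceph.conf lists the nbd, rbd and ceph kernel modules so they are loaded at boot, but systemd only reads it once it is placed under /etc/modules-load.d/ on each node. A sketch of one way to distribute it, assuming root SSH access to the workers (the loop and host names are illustrative; any configuration management tool would do the same job):

# Copy the module list to every storage node, reload it, and confirm the modules are present.
for node in k8sw1 k8sw2 k8sw3 k8sw4 k8sw5 k8sw6; do
  scp k8s/rook/modules-load.d/rookceph.conf "root@${node}:/etc/modules-load.d/rookceph.conf"
  ssh "root@${node}" 'systemctl restart systemd-modules-load.service && lsmod | grep -E "^(nbd|rbd|ceph) "'
done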