commit f29f47ba64
Marcel Wefers 2024-04-08 09:37:01 +02:00
82 changed files with 61981 additions and 0 deletions

.facts/localhost Normal file

@ -0,0 +1,3 @@
{
"discovered_interpreter_python": "/opt/homebrew/bin/python3.12"
}

.idea/.gitignore vendored Normal file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

.idea/inspectionProfiles/profiles_settings.xml Normal file

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

.idea/kubernetes_ansible.iml Normal file

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/templates" />
</list>
</option>
</component>
</module>

.idea/misc.xml Normal file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.9 (masasana_cloud)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (masasana_cloud)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/kubernetes_ansible.iml" filepath="$PROJECT_DIR$/.idea/kubernetes_ansible.iml" />
</modules>
</component>
</project>

.idea/vcs.xml Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

ansible.cfg Normal file

@ -0,0 +1,19 @@
[defaults]
inventory = inventory.yml
# ToDo: When ACN is used, the ssh key must be specified here
# private_key_file = /root/.ssh/id_rsa_sc_admin
remote_user = root
host_key_checking = False
gathering = smart
fact_caching = jsonfile
fact_caching_connection = .facts
fact_caching_timeout = 0
stdout_callback = yaml
interpreter_python = /usr/bin/python3
[inventory]
enable_plugins=host_list, script, auto, yaml, ini, toml, kubernetes.core.k8s
[ssh_connection]
ssh_common_args = -o StrictHostKeyChecking=no
scp_if_ssh = True
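
With `gathering = smart` and a `jsonfile` cache whose timeout is 0, facts (such as the `.facts/localhost` entry above) are gathered once and reused indefinitely. A minimal sketch of a play that forces a refresh of the cached facts, assuming the defaults above are active:

- hosts: all
  gather_facts: false
  tasks:
    - name: Re-gather facts explicitly; the result overwrites the cached entry in .facts
      ansible.builtin.setup: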

inventory.yml Normal file

@ -0,0 +1,41 @@
###########################################
# Masasana Cloud Platform Inventory
###########################################
all:
children:
kubernetes:
children:
controller:
children:
controller_init:
hosts:
k8s-master-1:
ansible_host: 10.0.0.2
ansible_python_interpreter: /usr/bin/python3.11
ansible_ssh_common_args: '-o ProxyCommand="ssh -p 22 -W %h:%p -q root@65.109.4.220"'
worker:
hosts:
k8s-node-1:
ansible_host: 10.0.0.3
ansible_python_interpreter: /usr/bin/python3.11
ansible_ssh_common_args: '-o ProxyCommand="ssh -p 22 -W %h:%p -q root@65.109.4.220"'
nat:
hosts:
nat-gateway:
ansible_host: 65.109.4.220
ansible_python_interpreter: /usr/bin/python3.11
ansible_ssh_common_args: '-o ProxyCommand="ssh -p 22 -W %h:%p -q root@65.109.4.220"'
kubernetes_api:
hosts:
k8s-api:
ansible_host: 65.109.222.158
ansible_python_interpreter: /usr/bin/python3.11
local:
hosts:
localhost:
ansible_host: 127.0.0.1
ansible_connection: local
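
Every host behind the NAT gateway repeats the same `ansible_python_interpreter` and `ansible_ssh_common_args` values. A sketch of the standard YAML-inventory shorthand that lifts the shared values into a group-level `vars:` block (shown for the controller group only; the host entries themselves are unchanged):

controller:
  vars:
    ansible_python_interpreter: /usr/bin/python3.11
    ansible_ssh_common_args: '-o ProxyCommand="ssh -p 22 -W %h:%p -q root@65.109.4.220"'
  children:
    controller_init:
      hosts:
        k8s-master-1:
          ansible_host: 10.0.0.2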

playbook.yml Normal file

@ -0,0 +1,11 @@
######################################
# Main Playbook #
######################################
---
- hosts: localhost
gather_facts: false
connection: local
- import_playbook: playbooks/setup_hetzner_nat.yml
- import_playbook: playbooks/setup_k8s_cluster.yml

playbooks/setup_hetzner_nat.yml Normal file

@ -0,0 +1,19 @@
######################################
# Prepare Hetzner NAT-Gateway #
######################################
---
- hosts: controller, worker, nat, local
gather_facts: false
tasks:
- name: Generate ssh key pair local
when: inventory_hostname in groups['local']
import_tasks: ../tasks/hetzner_nat/generate_ssh_key.yml
- name: Configuration of NAT-Server
when: inventory_hostname in groups['nat']
import_tasks: ../tasks/hetzner_nat/configuration_nat.yml
- name: Configuration of Client-Server (controller and worker)
when: inventory_hostname in groups['controller'] or inventory_hostname in groups['worker']
import_tasks: ../tasks/hetzner_nat/configuration_client.yml

playbooks/setup_k8s_cluster.yml Normal file

@ -0,0 +1,48 @@
######################################
# Prepare Linux and install k8s #
######################################
---
- hosts: controller, worker, proxmox
gather_facts: false
tasks:
- name: Read global default values
include_vars:
file: ../vars/default.yml
- name: Read cluster default values
include_vars:
file: ../vars/k8s_cluster/cluster_vars.yml
- name: base install block
when: inventory_hostname in (groups['controller'] + groups['worker'])
block:
- name: Prepare Debian System
import_tasks: ../tasks/k8s_cluster/system/prepare_debian_system.yml
- name: Setup k8s Cluster
import_tasks: ../tasks/k8s_cluster/kubernetes/setupK8sCluster.yml
- name: Install linkerd Service Mesh
import_tasks: ../tasks/k8s_cluster/serviceMesh/installLinkerd.yml
- name: Setup Storage
import_tasks: ../tasks/k8s_cluster/storage/setupStorage.yml
- name: base install block
when: inventory_hostname in groups['controller']
block:
- name: install Helm3
import_tasks: ../tasks/k8s_cluster/helm/install_helm3.yml
- name: base install block controller_init
when: inventory_hostname in groups['controller_init']
block:
- name: Install MetalLB
import_tasks: ../tasks/k8s_cluster/loadbalancer/install_metallb.yml
- name: install nginx ingress
import_tasks: ../tasks/k8s_cluster/ingress/install_nginx_ingress.yml
- name: Install cert-manager
import_tasks: ../tasks/k8s_cluster/cert_manager/install_cert_manager.yml

tasks/hetzner_nat/configuration_client.yml Normal file

@ -0,0 +1,31 @@
######################################
# Configuration Client-Server #
######################################
---
- name: Edit Network interface file on Client Server
blockinfile:
path: /etc/network/interfaces
block: |
auto enp7s0
iface enp7s0 inet dhcp
post-up ip route add default via 10.0.0.1
- name: Add Hetzner Nameserver
blockinfile:
path: /etc/resolvconf/resolv.conf.d/head
block: |
nameserver 8.8.8.8
nameserver 8.8.4.4
- name: Enable Updates for resolvconf
raw: "resolvconf --enable-updates"
- name: Update resolvconf
raw: "resolvconf -u"
- name: Reboot Clients
reboot:

tasks/hetzner_nat/configuration_nat.yml Normal file

@ -0,0 +1,21 @@
######################################
# Configuration NAT-Server #
######################################
---
- name: Copy Public ssh-key and paste to NAT-Server
copy:
src: /tmp/id_rsa.pub
dest: ~/.ssh/
- name: Change Permission of the Private ssh-key only to read for the User
raw: "chmod 0400 ~/.ssh/id_rsa"
- name: Edit Network interface file on NAT-Server
blockinfile:
path: /etc/network/interfaces
block: |
auto enp7s0
iface enp7s0 inet dhcp
post-up echo 1 > /proc/sys/net/ipv4/ip_forward
post-up iptables -t nat -A POSTROUTING -s '10.0.0.0/16' -o enp7s0 -j MASQUERADE
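
The forwarding flag above is only applied per interface-up event. A sketch of a persistent alternative using the `ansible.posix.sysctl` module (assumes the `ansible.posix` collection is available):

- name: Enable IPv4 forwarding persistently
  ansible.posix.sysctl:
    name: net.ipv4.ip_forward
    value: '1'
    sysctl_set: true
    state: present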

tasks/hetzner_nat/generate_ssh_key.yml Normal file

@ -0,0 +1,8 @@
######################################
# Generate SSH-Key #
######################################
---
- name: Generate an OpenSSH keypair local
community.crypto.openssh_keypair:
path: /tmp/id_rsa
type: rsa

View File

@ -0,0 +1,5 @@
---
- name: Deploy example-app from manifest
kubernetes.core.k8s:
state: present
definition: "{{ lookup('file', './manifests/example-app/deploy.yml') | from_yaml_all }}"

tasks/k8s_cluster/cert_manager/install_cert_manager.yml Normal file

@ -0,0 +1,52 @@
######################################
# Install cert-manager in cluster #
######################################
---
- name: Read cert-manager values
include_vars:
file: ../vars/k8s_cluster/cert_manager/certManager.yml
- name: Create cert-manager namespace
k8s:
state: present
definition:
apiVersion: v1
kind: Namespace
metadata:
name: "{{ namespace }}"
annotations:
linkerd.io/inject: 'enabled'
- name: Add cert-manager repo
kubernetes.core.helm_repository:
name: "{{ helm.releaseName }}"
repo_url: "{{ helm.repoUrl }}"
- name: Install CRDs for cert-manager
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/cert_manager/cert_manager_crds.yaml') | from_yaml_all }}"
- name: Deploy cert-manager from helm chart
kubernetes.core.helm:
name: "{{ helm.releaseName }}"
state: present
chart_ref: "{{ helm.chart }}"
release_namespace: "{{ namespace }}"
chart_version: "{{ helm.chartVersion }}"
update_repo_cache: "true"
- name: Pause for 1.5 minutes and wait for cert-manager webhook
ansible.builtin.pause:
seconds: 90
- name: Deploy cert-manager lets-encrypt staging config file
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/cert_manager/lets_encrypt_staging.yml.j2') | from_yaml_all }}"
- name: Deploy cert-manager lets-encrypt production config file
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/cert_manager/lets_encrypt_production.yml.j2') | from_yaml_all }}"

tasks/k8s_cluster/cluster/vanilla_kubernetes/add_controller_to_cluster.yml Normal file

@ -0,0 +1,17 @@
######################################
# add controller to existing cluster #
######################################
---
- name: copy clusterConfig to remote location
template:
src: '../templates/k8s_cluster/cluster/joinController.yml.j2'
dest: /tmp/joinController.yml
- name: Join the controller node to cluster
command: kubeadm join --config=/tmp/joinController.yml
- name: Setup kubeconfig for local usage
command: "{{ item }}"
loop:
- mkdir -p ~/.kube
- cp -i /etc/kubernetes/admin.conf ~/.kube/config

tasks/k8s_cluster/cluster/vanilla_kubernetes/add_worker_to_cluster.yml Normal file

@ -0,0 +1,12 @@
######################################
# add worker to existing cluster #
######################################
---
- name: Copy the worker join command to server location
copy: src=join_command_worker.sh dest=/tmp/join_command_worker.sh mode=0777
- name: Join the worker node to cluster
command: sh /tmp/join_command_worker.sh
- name: Delete local copy of join worker
local_action: file path=./join_command_worker.sh state=absent
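
`local_action` with key=value shorthand still works, but the delegated long form is the current idiom; a sketch of the last task in that style:

- name: Delete local copy of the worker join script
  ansible.builtin.file:
    path: ./join_command_worker.sh
    state: absent
  delegate_to: localhost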

tasks/k8s_cluster/cluster/vanilla_kubernetes/generate_join_command.yml Normal file

@ -0,0 +1,28 @@
######################################
# Tasks for init k8s cluster #
######################################
---
- name: Generate join command
command: kubeadm token create --print-join-command
register: join_command
- name: Copy join command to local file
local_action: copy content="{{ join_command.stdout_lines[0] }}" dest="./join_command_worker.sh"
- name: Generate join command controller token
command: kubeadm token create
register: join_command_token
- name: Generate join command controller certsKey
command: kubeadm init phase upload-certs --upload-certs
register: join_command_controller_certskey
- name: Generate join command controller certssh256
shell: openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | openssl dgst -sha256 -hex | sed 's/^.* //'
register: join_command_controller_certsha256
- name: save facts for controller join
set_fact:
token: '{{ join_command_token.stdout }}'
certskey: '{{ join_command_controller_certskey.stdout_lines[-1] }}'
certsha256: '{{ join_command_controller_certsha256.stdout }}'

tasks/k8s_cluster/cluster/vanilla_kubernetes/init_kubernetes_cluster.yml Normal file

@ -0,0 +1,69 @@
######################################
# Tasks for init k8s cluster #
######################################
---
- name: Get hostname
command: hostname
register: old_hostname
changed_when: false
- set_fact: hostname={{ old_hostname.stdout | lower }}
- name: Pull k8s images
command: kubeadm config images pull --kubernetes-version=v{{ kubernetesVersion }}
- name: copy clusterConfig to remote location
template:
src: '../templates/k8s_cluster/cluster/clusterConfiguration.yml.j2'
dest: /tmp/clusterConfiguration.yml
- name: Initialize the Kubernetes cluster using kubeadm
command:
argv:
- kubeadm
- init
- --config=/tmp/clusterConfiguration.yml
- --node-name={{ hostname }}
- --ignore-preflight-errors
- Swap
- --upload-certs
- name: Remove clusterConfig on remote location
ansible.builtin.file:
path: /tmp/clusterConfiguration.yml
state: absent
- name: Setup kubeconfig for local usage
command: "{{ item }}"
loop:
- mkdir -p ~/.kube
- cp -i /etc/kubernetes/admin.conf ~/.kube/config
- name: Wait for all k8s nodes to be ready
shell: kubectl wait --for=condition=Ready nodes --all --timeout=600s
register: nodes_ready
- name: create Calico NetworkManager directory
file:
path: '/etc/NetworkManager/conf.d/'
state: directory
mode: 0755
- name: Configure Calico NetworkManager
template:
src: ../templates/k8s_cluster/cluster/calico.conf.j2
dest: /etc/NetworkManager/conf.d/calico.conf
owner: root
mode: '0644'
- name: Install calico pod network
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/cluster/calico.yml.j2') | from_yaml_all }}"
- name: Wait for calico daemonset to become ready
command: "kubectl rollout status daemonset calico-node -n kube-system --timeout 60s"
- name: Generate join command
command: kubeadm token create --print-join-command
register: join_command

tasks/k8s_cluster/cluster/vanilla_kubernetes/restart_coredns.yml Normal file

@ -0,0 +1,9 @@
########################################
# Restart DNS after DaemonSet Deployment #
########################################
---
- name: Wait for calico pods to become ready
command: "kubectl rollout status daemonset calico-node -n kube-system --timeout 120s"
- name: Restart CoreDNS deployment
command: "kubectl rollout restart deployments/coredns -n kube-system"

tasks/k8s_cluster/helm/install_helm3.yml Normal file

@ -0,0 +1,21 @@
######################################
# Install Helm3 in cluster #
######################################
---
- name: Read helm3 values
include_vars:
file: ../vars/k8s_cluster/helm/helm3.yml
- name: Download Helm install script
get_url:
url: "{{ helm_install_script }}"
dest: "~/get_helm.sh"
mode: 0700
- name: Install Helm
command: "~/get_helm.sh"
- name: Delete Helm install script
file:
state: absent
path: "~/get_helm.sh"

tasks/k8s_cluster/ingress/install_nginx_ingress.yml Normal file

@ -0,0 +1,34 @@
######################################
# Deploy nginx ingress controller #
######################################
---
- name: Read ingress nginx values
include_vars:
file: ../vars/k8s_cluster/ingress/ingressNginx.yml
- name: "Create namespace '{{ namespace }}'"
kubernetes.core.k8s:
state: present
definition:
apiVersion: v1
kind: Namespace
metadata:
name: '{{ namespace }}'
labels:
name: '{{ namespace }}'
- name: Add nginx ingress controller chart repo
kubernetes.core.helm_repository:
name: "{{ helm.releaseName }}"
repo_url: "{{ helm.repoUrl }}"
- name: Deploy nginx ingress controller from helm chart
kubernetes.core.helm:
name: '{{ helm.releaseName }}'
state: present
chart_ref: '{{ helm.chart }}'
release_namespace: '{{ namespace }}'
chart_version: '{{ helm.chartVersion }}'
update_repo_cache: 'true'
## ToDo: Deploy the nginx controller with custom values
# values: "{{ lookup('template', '../templates/k8s_cluster/ingress/ingressNginxValues.yml') | from_yaml }}"

tasks/k8s_cluster/kube_vip/install_kube_vip.yml Normal file

@ -0,0 +1,11 @@
######################################
# Deploy kube-vip virtualIP #
######################################
---
- name: Deploy kube-vip as static pod
template:
src: ../templates/k8s_cluster/kube_vip/kube_vip.yml.j2
dest: /etc/kubernetes/manifests/kube-vip.yml
owner: root
mode: '0600'
when: installKubeVip

tasks/k8s_cluster/kubernetes/install_vanilla_kubernetes.yml Normal file

@ -0,0 +1,129 @@
######################################
# Tasks for vanilla kubernetes install #
######################################
---
- name: Get OS version name
command: lsb_release -cs
register: os_codename
changed_when: false
- name: Get OS release number
command: lsb_release -rs
register: os_release
changed_when: false
- name: Add an apt signing key for CRI-O
apt_key:
url: "{{ item }}"
state: present
loop:
- 'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/{{ crio_version }}/Debian_{{ os_release.stdout }}/Release.key'
- 'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/Debian_{{ os_release.stdout }}/Release.key'
- name: Add CRI-O apt repository for stable version
apt_repository:
repo: deb http://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/Debian_{{ os_release.stdout }}/ /
filename: devel:kubic:libcontainers:stable.list
state: present
update_cache: yes
- apt_repository:
repo: deb https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/{{ crio_version }}/Debian_{{ os_release.stdout }}/ /
filename: devel:kubic:libcontainers:stable:cri-o:{{ crio_version }}.list
state: present
update_cache: yes
- name: Install CRI-O packages
apt:
name: "{{ packages }}"
state: present
update_cache: yes
allow_unauthenticated: true
vars:
packages:
- cri-o
- cri-o-runc
- name: Enable and start CRI-O service
ansible.builtin.systemd:
name: crio.service
state: started
enabled: yes
- name: CRI-O use systemd cgroup driver
copy:
dest: "/etc/crio/crio.conf.d/02-cgroup-manager.conf"
content: |
[crio.runtime]
conmon_cgroup = "pod"
cgroup_manager = "systemd"
- name: Overriding the CRI-O sandbox (pause) image
lineinfile:
path: /etc/crio/crio.conf
regexp: '#? ?pause_image ?= ?"registry\.k8s\.io/pause:(.+)"'
backrefs: True
line: pause_image = "registry.k8s.io/pause:\1"
- name: Forwarding IPv4 and letting iptables see bridged traffic
copy:
dest: "/etc/modules-load.d/k8s.conf"
content: |
overlay
br_netfilter
- name: modprobe overlay & br-netfilter
command: "{{ item }}"
loop:
- modprobe overlay
- modprobe br_netfilter
#sysctl params required by setup, params persist across reboots
- name: ipv4 bridge forward
copy:
dest: "/etc/sysctl.d/k8s.conf"
content: |
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
- name: Apply sysctl params without reboot
command: sysctl --system
- name: Import Kubernetes GPG key
raw: "curl -fsSL https://pkgs.k8s.io/core:/stable:/v{{ kubernetesVersion.split('.')[:2] | join('.') }}/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg"
- name: Add Kubernetes apt repository
raw: "echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v{{ kubernetesVersion.split('.')[:2] | join('.') }}/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list"
- name: Remove swapfile from /etc/fstab
mount:
name: "{{ item }}"
fstype: swap
state: absent
with_items:
- swap
- none
- name: Disable swap
command: swapoff -a
- name: Update apt cache
raw: apt-get -y update
changed_when: False
- name: Install Kubernetes binaries
apt:
name: "{{ packages }}"
state: present
update_cache: yes
vars:
packages:
- "kubelet={{ kubernetesVersion }}-1.1"
- "kubeadm={{ kubernetesVersion }}-1.1"
- "kubectl={{ kubernetesVersion }}-1.1"
- name: Add kubectl completion bash
lineinfile:
path: ~/.bashrc
line: source <(kubectl completion bash)
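
The two `raw` steps that import the Kubernetes apt key and repository are not idempotent and bypass Ansible's module layer. A sketch of the same steps with regular modules, assuming `gpg` is installed on the target (paths and the version expression are taken from the tasks above):

- name: Ensure the apt keyring directory exists
  ansible.builtin.file:
    path: /etc/apt/keyrings
    state: directory
    mode: '0755'
- name: Download the ASCII-armored Kubernetes release key
  ansible.builtin.get_url:
    url: "https://pkgs.k8s.io/core:/stable:/v{{ kubernetesVersion.split('.')[:2] | join('.') }}/deb/Release.key"
    dest: /tmp/kubernetes-release.key
- name: Dearmor the key to the path the repo entry references
  ansible.builtin.command:
    cmd: gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg /tmp/kubernetes-release.key
    creates: /etc/apt/keyrings/kubernetes-apt-keyring.gpg
- name: Add the Kubernetes apt repository
  ansible.builtin.apt_repository:
    repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v{{ kubernetesVersion.split('.')[:2] | join('.') }}/deb/ /"
    filename: kubernetes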

tasks/k8s_cluster/kubernetes/setupK8sCluster.yml Normal file

@ -0,0 +1,39 @@
######################################
# Setup k8s Cluster #
######################################
---
- name: kubernetes installation
block:
- name: vanilla kubernetes install block
when: kubernetesClusterType == 'vanilla'
block:
- name: Read vanilla kubernetes values
include_vars:
file: ../vars/k8s_cluster/kubernetes/vanilla_kubernetes.yml
- import_tasks: ../tasks/k8s_cluster/kubernetes/install_vanilla_kubernetes.yml
# ToDo: find solution for VIP
# - name: Read kube_vip values for virtual IP
# include_vars:
# file: ../vars/k8s_cluster/kube_vip/kube_vip.yml
# - import_tasks: ../tasks/k8s_cluster/kube_vip/install_kube_vip.yml
# when: inventory_hostname in groups['controller']
- import_tasks: ../tasks/k8s_cluster/cluster/vanilla_kubernetes/init_kubernetes_cluster.yml
when: inventory_hostname in groups['controller_init']
- import_tasks: ../tasks/k8s_cluster/cluster/vanilla_kubernetes/generate_join_command.yml
when: inventory_hostname in groups['controller_init']
#ToDo: when controller replica exists
## - import_tasks: ../tasks/k8s_cluster/cluster/vanilla_kubernetes/add_controller_to_cluster.yml
## when: inventory_hostname in groups['controller_replica']
- import_tasks: ../tasks/k8s_cluster/cluster/vanilla_kubernetes/add_worker_to_cluster.yml
when: inventory_hostname in groups['worker']
- import_tasks: ../tasks/k8s_cluster/cluster/vanilla_kubernetes/restart_coredns.yml
when: inventory_hostname in groups['controller_init']
#
# - name: install microk8s block
# when: kubernetesClusterType == 'microk8s'
# block:
# - debug: msg='ToDo install microk8s'

tasks/k8s_cluster/loadbalancer/install_metallb.yml Normal file

@ -0,0 +1,46 @@
######################################
# Install MetalLB in cluster #
######################################
---
- name: Read metallb values
include_vars:
file: ../vars/k8s_cluster/loadbalancer/metallb.yml
- name: Create metallb namespace
k8s:
state: present
definition:
apiVersion: v1
kind: Namespace
metadata:
name: "{{ namespace }}"
annotations:
linkerd.io/inject: 'enabled'
- name: Add metallb repo
kubernetes.core.helm_repository:
name: "{{ helm.releaseName }}"
repo_url: "{{ helm.repoUrl }}"
- name: Deploy metallb from helm chart
kubernetes.core.helm:
name: "{{ helm.releaseName }}"
state: present
chart_ref: "{{ helm.chart }}"
release_namespace: "{{ namespace }}"
chart_version: "{{ helm.chartVersion }}"
update_repo_cache: "true"
values: "{{ lookup('template', '../templates/k8s_cluster/loadbalancer/metallb.yml') | from_yaml }}"
- name: Pause for 25 seconds and wait for metallb
ansible.builtin.pause:
seconds: 25
- name: Pause for 15 seconds and wait for metallb webhook
ansible.builtin.pause:
seconds: 15
- name: Deploy metallb IPAddressPool
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/loadbalancer/metal_lb_configmap.yml.j2') | from_yaml_all }}"

tasks/k8s_cluster/serviceMesh/installLinkerd.yml Normal file

@ -0,0 +1,42 @@
######################################
# Install linkerd service mesh #
######################################
---
- name: install linkerd service mesh
when: inventory_hostname in groups['controller']
block:
- name: Download linkerd install script
get_url:
url: https://run.linkerd.io/install
dest: /tmp/linkerd.sh
- name: Install linkerd CLI
shell:
cmd: cat /tmp/linkerd.sh | sh
- name: Set linkerd .bashrc
lineinfile:
path: ~/.bashrc
line: 'PATH=$PATH:/root/.linkerd2/bin'
- name: init linkerd on controller1
when: inventory_hostname in groups['controller_init']
block:
- name: Install linkerd CRD in Cluster
shell: "linkerd install --crds | kubectl apply -f -"
- name: Install linkerd in Cluster
shell: "linkerd install | kubectl apply -f -"
- name: Wait for linkerd pods to become ready
command: "kubectl rollout status deployment linkerd-destination -n linkerd --timeout 150s"
- name: Wait for linkerd pods to become ready
command: "kubectl rollout status deployment linkerd-proxy-injector -n linkerd --timeout 150s"
- name: Install linkerd Dashboard
shell: "linkerd viz install | kubectl apply -f -"
- name: Pause for 15 seconds and wait for linkerd installation
pause:
seconds: 15
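
Note that the PATH line added to ~/.bashrc above only affects interactive shells; the `linkerd ... | kubectl apply` tasks rely on /root/.linkerd2/bin already being on the Ansible shell's PATH. A sketch of pinning it explicitly per task (the fallback PATH value is an assumption):

- name: Install linkerd CRD in Cluster
  ansible.builtin.shell: "linkerd install --crds | kubectl apply -f -"
  environment:
    PATH: "/root/.linkerd2/bin:{{ ansible_env.PATH | default('/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin') }}"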

tasks/k8s_cluster/storage/cephAddPrometheus.yml Normal file

@ -0,0 +1,27 @@
######################################
# Setup ceph prometheus Monitoring #
######################################
---
- name: Storage monitoring block
when: inventory_hostname in groups['kubernetes_api']
block:
- name: Read rook-ceph storage values
include_vars:
file: ../vars/k8s_cluster/storage/rook_ceph.yml
- name: Deploy rook CRDs, common resources and operator from manifest
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/storage/rook/monitoring/{{ item }}') | from_yaml_all }}"
loop:
- 'csi-metrics-service-monitor.yaml'
- 'service-monitor.yaml'
- 'rbac.yaml'
- name: Setting monitoring fact rook-ceph
set_fact: cephMonitoring=true
- name: Deploy rook cluster from manifest
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/storage/rook/cluster' + ('-test' if rook_cluster_type == 'dev' else '') + '.yaml') | from_yaml_all }}"

tasks/k8s_cluster/storage/install_rook_ceph_storage.yml Normal file

@ -0,0 +1,82 @@
######################################
# Setup rook-ceph storage #
######################################
---
- name: "Create namespace '{{ namespace }}'"
kubernetes.core.k8s:
state: present
definition:
apiVersion: v1
kind: Namespace
metadata:
name: '{{ namespace }}'
labels:
name: '{{ namespace }}'
annotations:
linkerd.io/inject: 'enabled'
- name: Deploy rook CRDs, common resources and operator from manifest
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/storage/rook/{{ item }}') | from_yaml_all }}"
loop:
- 'crds.yaml'
- 'common.yaml'
- 'operator.yaml'
- name: Verify if the rook operator is up and running
k8s:
kind: Deployment
name: rook-ceph-operator
namespace: "rook-ceph"
register: ret
until: "ret.get('result', {}).get('status', {}).get('conditions', []) | length and ret.get('result', {}).get('status', {}).get('conditions', [])[0].get('status') == 'True'"
retries: 10
delay: 20
# ToDo: Tobi, please check whether this approach is okay. Monitoring is set again in k8scluster/storage/cephAddPrometheus
- name: Setting monitoring fact rook-ceph
set_fact: cephMonitoring=false
- name: Deploy rook cluster from manifest
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/storage/rook/cluster' + ('-test' if rook_cluster_type == 'dev' else '') + '.yaml') | from_yaml_all }}"
- name: Verify the cluster deploy is complete
k8s:
kind: CephCluster
name: '{{ rook_cluster_config["name"] }}'
namespace: "rook-ceph"
register: cluster_data
until: "cluster_data.get('result', {}).get('status', {}).get('state') == 'Created'"
retries: 20
delay: 30
ignore_errors: yes
- name: Safety buffer for OSD enrollment
pause:
seconds: 60
- name: Deploy rook block storage class
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/storage/rook/csi/rbd/storageclass' + ('-test' if rook_cluster_type == 'dev' else '') + '.yaml') | from_yaml_all }}"
- name: Create rook filesystem
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/storage/rook/filesystem' + ('-test' if rook_cluster_type == 'dev' else '') + '.yaml') | from_yaml_all }}"
- name: Safety buffer for filesystem enrollment
pause:
seconds: 25
- name: Wait for ceph fs pods to become ready
shell: kubectl wait --namespace=rook-ceph --for=condition=Ready pods --selector app=rook-ceph-mds --timeout=600s
register: ceph_pods_ready
- name: Deploy rook file storage class
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', '../templates/k8s_cluster/storage/rook/csi/cephfs/storageclass.yaml') | from_yaml_all }}"

tasks/k8s_cluster/storage/setupStorage.yml Normal file

@ -0,0 +1,15 @@
######################################
# Setup Storage #
######################################
---
- name: Storage block
when: inventory_hostname in (groups['controller_init'])
block:
- name: Read rook-ceph storage values
include_vars:
file: ../vars/k8s_cluster/storage/rook_ceph.yml
- name: rook internal ceph
when: inventory_hostname in groups['controller_init']
block:
- import_tasks: ../tasks/k8s_cluster/storage/install_rook_ceph_storage.yml

tasks/k8s_cluster/system/prepare_debian_system.yml Normal file

@ -0,0 +1,38 @@
#######################################
# Tasks to prepare a Debian System #
#######################################
---
- name: Read debian values and prepare system
include_vars:
file: ../vars/k8s_cluster/system/debian.yml
- name: Update apt cache
raw: apt-get -y update
changed_when: False
- name: Install required system packages
apt: name={{ sys_packages }} state=present update_cache=yes cache_valid_time=3600
- name: Install required kubernetes system packages
apt: name={{ k8s_sys_packages }} state=present update_cache=yes cache_valid_time=3600
when: inventory_hostname in groups['kubernetes']
- name: Delete EXTERNALLY-MANAGED python venv
ansible.builtin.file:
state: absent
path: /usr/lib/python3.11/EXTERNALLY-MANAGED
- name: Install required Python modules
pip: name={{ pip_packages }} state=present
when: inventory_hostname in groups['kubernetes']
- name: Get hostname
command: hostname
register: old_hostname
changed_when: false
- set_fact: hostname={{ old_hostname.stdout | lower }}
# No capital letters in the hostname
- name: Change the hostname
command: hostnamectl set-hostname {{ hostname }}

File diff suppressed because it is too large

File diff suppressed because it is too large

templates/k8s_cluster/cert_manager/lets_encrypt_production.yml.j2 Normal file

@ -0,0 +1,14 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-production
spec:
acme:
server: https://acme-v02.api.letsencrypt.org/directory
email: {{ email }}
privateKeySecretRef:
name: letsencrypt-cluster-issuer-key
solvers:
- http01:
ingress:
class: nginx

templates/k8s_cluster/cert_manager/lets_encrypt_staging.yml.j2 Normal file

@ -0,0 +1,14 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-staging
spec:
acme:
server: https://acme-staging-v02.api.letsencrypt.org/directory
email: {{ email }}
privateKeySecretRef:
name: letsencrypt-staging
solvers:
- http01:
ingress:
class: nginx

templates/k8s_cluster/cluster/calico.conf.j2 Normal file

@ -0,0 +1,2 @@
[keyfile]
unmanaged-devices=interface-name:cali*;interface-name:tunl*;interface-name:vxlan.calico;interface-name:wireguard.cali

templates/k8s_cluster/cluster/calico.yml.j2 Normal file (diff suppressed because it is too large)

templates/k8s_cluster/cluster/clusterConfiguration.yml.j2 Normal file

@ -0,0 +1,35 @@
apiVersion: kubeadm.k8s.io/v1beta3
kind: InitConfiguration
localAPIEndpoint:
advertiseAddress: '{{ hostvars['k8s-master-1'].ansible_host }}'
bindPort: 6443
---
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: 'v{{ kubernetesVersion.split("-")[0] }}'
apiServer:
extraArgs:
authorization-mode: Node,RBAC
timeoutForControlPlane: 4m0s
certificatesDir: /etc/kubernetes/pki
clusterName: '{{ kubernetesClusterName }}'
controlPlaneEndpoint: '{{ kubernetesApi }}:6443'
controllerManager:
extraArgs:
bind-address: 0.0.0.0
dns: {}
etcd:
local:
dataDir: /var/lib/etcd
extraArgs:
listen-metrics-urls: http://0.0.0.0:2381
networking:
dnsDomain: cluster.local
serviceSubnet: 10.96.0.0/12
scheduler:
extraArgs:
bind-address: 0.0.0.0
---
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
metricsBindAddress: 0.0.0.0

templates/k8s_cluster/cluster/joinController.yml.j2 Normal file

@ -0,0 +1,15 @@
apiVersion: kubeadm.k8s.io/v1beta3
kind: JoinConfiguration
discovery:
bootstrapToken:
apiServerEndpoint: {{ kubernetesApi }}:6443
caCertHashes:
- sha256:{{ hostvars['k8s-master-1']['certsha256'] }}
token: {{ hostvars['k8s-master-1']['token'] }}
nodeRegistration:
kubeletExtraArgs:
node-ip: {{ ansible_host }}
controlPlane:
localAPIEndpoint:
advertiseAddress: {{ ansible_host }}
certificateKey: {{ hostvars['k8s-master-1']['certskey'] }}

templates/k8s_cluster/ingress/ingressNginxValues.yml Normal file (diff suppressed because it is too large)

templates/k8s_cluster/kube_vip/kube_vip.yml.j2 Normal file

@ -0,0 +1,61 @@
apiVersion: v1
kind: Pod
metadata:
creationTimestamp: null
name: kube-vip
namespace: kube-system
spec:
containers:
- args:
- manager
env:
- name: vip_arp
value: "true"
- name: port
value: "6443"
- name: vip_interface
value: {{ interface }}
- name: vip_cidr
value: "32"
- name: cp_enable
value: "true"
- name: cp_namespace
value: kube-system
- name: vip_ddns
value: "false"
- name: svc_enable
value: "true"
- name: vip_leaderelection
value: "true"
- name: vip_leaseduration
value: "5"
- name: vip_renewdeadline
value: "3"
- name: vip_retryperiod
value: "1"
- name: address
value: {{ virtual_ip }}
- name: prometheus_server
value: :2112
image: ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
imagePullPolicy: Always
name: kube-vip
resources: {}
securityContext:
capabilities:
add:
- NET_ADMIN
- NET_RAW
volumeMounts:
- mountPath: /etc/kubernetes/admin.conf
name: kubeconfig
hostAliases:
- hostnames:
- kubernetes
ip: 127.0.0.1
hostNetwork: true
volumes:
- hostPath:
path: /etc/kubernetes/admin.conf
name: kubeconfig
status: {}

templates/k8s_cluster/loadbalancer/metal_lb_configmap.yml.j2 Normal file

@ -0,0 +1,8 @@
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
name: test
namespace: metallb
spec:
addresses:
- 192.168.10.0/24
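
Since MetalLB 0.13, an `IPAddressPool` is only announced once a matching advertisement resource exists; the chart does not create one. A minimal sketch for L2 mode (assuming L2, not BGP, is intended for this pool):

apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
  name: test
  namespace: metallb
spec:
  ipAddressPools:
    - test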

templates/k8s_cluster/loadbalancer/metallb.yml Normal file

@ -0,0 +1,363 @@
# Default values for metallb.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
loadBalancerClass: ""
# To configure MetalLB, you must specify ONE of the following two
# options.
rbac:
# create specifies whether to install and use RBAC rules.
create: true
prometheus:
# scrape annotations specifies whether to add Prometheus metric
# auto-collection annotations to pods. See
# https://github.com/prometheus/prometheus/blob/release-2.1/documentation/examples/prometheus-kubernetes.yml
# for a corresponding Prometheus configuration. Alternatively, you
# may want to use the Prometheus Operator
# (https://github.com/coreos/prometheus-operator) for more powerful
# monitoring configuration. If you use the Prometheus operator, this
# can be left at false.
scrapeAnnotations: false
# port both controller and speaker will listen on for metrics
metricsPort: 7472
# if set, enables rbac proxy on the controller and speaker to expose
# the metrics via tls.
# secureMetricsPort: 9120
# the name of the secret to be mounted in the speaker pod
# to expose the metrics securely. If not present, a self-signed
# certificate will be used.
speakerMetricsTLSSecret: ""
# the name of the secret to be mounted in the controller pod
# to expose the metrics securely. If not present, a self-signed
# certificate will be used.
controllerMetricsTLSSecret: ""
# prometheus doesn't have permission to scrape all namespaces, so we give it permission to scrape metallb's
rbacPrometheus: true
# the service account used by prometheus
# required when " .Values.prometheus.rbacPrometheus == true " and " .Values.prometheus.podMonitor.enabled=true or prometheus.serviceMonitor.enabled=true "
serviceAccount: ""
# the namespace where prometheus is deployed
# required when " .Values.prometheus.rbacPrometheus == true " and " .Values.prometheus.podMonitor.enabled=true or prometheus.serviceMonitor.enabled=true "
namespace: ""
# the image to be used for the kuberbacproxy container
rbacProxy:
repository: gcr.io/kubebuilder/kube-rbac-proxy
tag: v0.12.0
pullPolicy:
# Prometheus Operator PodMonitors
podMonitor:
# enable support for Prometheus Operator
enabled: false
# optional additional labels for podMonitors
additionalLabels: {}
# optional annotations for podMonitors
annotations: {}
# Job label for scrape target
jobLabel: "app.kubernetes.io/name"
# Scrape interval. If not set, the Prometheus default scrape interval is used.
interval:
# metric relabel configs to apply to samples before ingestion.
metricRelabelings: []
# - action: keep
# regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
# sourceLabels: [__name__]
# relabel configs to apply to samples before ingestion.
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# target_label: nodename
# replacement: $1
# action: replace
# Prometheus Operator ServiceMonitors. To be used as an alternative
# to podMonitor, supports secure metrics.
serviceMonitor:
# enable support for Prometheus Operator
enabled: false
speaker:
# optional additional labels for the speaker serviceMonitor
additionalLabels: {}
# optional additional annotations for the speaker serviceMonitor
annotations: {}
# optional tls configuration for the speaker serviceMonitor, in case
# secure metrics are enabled.
tlsConfig:
insecureSkipVerify: true
controller:
# optional additional labels for the controller serviceMonitor
additionalLabels: {}
# optional additional annotations for the controller serviceMonitor
annotations: {}
# optional tls configuration for the controller serviceMonitor, in case
# secure metrics are enabled.
tlsConfig:
insecureSkipVerify: true
# Job label for scrape target
jobLabel: "app.kubernetes.io/name"
# Scrape interval. If not set, the Prometheus default scrape interval is used.
interval:
# metric relabel configs to apply to samples before ingestion.
metricRelabelings: []
# - action: keep
# regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
# sourceLabels: [__name__]
# relabel configs to apply to samples before ingestion.
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# target_label: nodename
# replacement: $1
# action: replace
# Prometheus Operator alertmanager alerts
prometheusRule:
# enable alertmanager alerts
enabled: false
# optional additional labels for prometheusRules
additionalLabels: {}
# optional annotations for prometheusRules
annotations: {}
# MetalLBStaleConfig
staleConfig:
enabled: true
labels:
severity: warning
# MetalLBConfigNotLoaded
configNotLoaded:
enabled: true
labels:
severity: warning
# MetalLBAddressPoolExhausted
addressPoolExhausted:
enabled: true
labels:
severity: alert
addressPoolUsage:
enabled: true
thresholds:
- percent: 75
labels:
severity: warning
- percent: 85
labels:
severity: warning
- percent: 95
labels:
severity: alert
# MetalLBBGPSessionDown
bgpSessionDown:
enabled: true
labels:
severity: alert
extraAlerts: []
# controller contains configuration specific to the MetalLB cluster
# controller.
controller:
enabled: true
# -- Controller log level. Must be one of: `all`, `debug`, `info`, `warn`, `error` or `none`
logLevel: info
# command: /controller
# webhookMode: enabled
image:
repository: quay.io/metallb/controller
tag:
pullPolicy:
## @param controller.updateStrategy.type Metallb controller deployment strategy type.
## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy
## e.g:
## strategy:
## type: RollingUpdate
## rollingUpdate:
## maxSurge: 25%
## maxUnavailable: 25%
##
strategy:
type: RollingUpdate
serviceAccount:
# Specifies whether a ServiceAccount should be created
create: true
# The name of the ServiceAccount to use. If not set and create is
# true, a name is generated using the fullname template
name: ""
annotations: {}
securityContext:
runAsNonRoot: true
# nobody
runAsUser: 65534
fsGroup: 65534
resources: {}
# limits:
# cpu: 100m
# memory: 100Mi
nodeSelector: {}
tolerations: []
priorityClassName: ""
runtimeClassName: ""
affinity: {}
podAnnotations: {}
labels: {}
livenessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
tlsMinVersion: "VersionTLS12"
tlsCipherSuites: ""
extraContainers: []
# speaker contains configuration specific to the MetalLB speaker
# daemonset.
speaker:
enabled: true
# command: /speaker
# -- Speaker log level. Must be one of: `all`, `debug`, `info`, `warn`, `error` or `none`
logLevel: info
tolerateMaster: true
memberlist:
enabled: true
mlBindPort: 7946
mlBindAddrOverride: ""
mlSecretKeyPath: "/etc/ml_secret_key"
excludeInterfaces:
enabled: true
# ignore the exclude-from-external-loadbalancer label
ignoreExcludeLB: false
image:
repository: quay.io/metallb/speaker
tag:
pullPolicy:
## @param speaker.updateStrategy.type Speaker daemonset strategy type
## ref: https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/
##
updateStrategy:
## StrategyType
## Can be set to RollingUpdate or OnDelete
##
type: RollingUpdate
serviceAccount:
# Specifies whether a ServiceAccount should be created
create: true
# The name of the ServiceAccount to use. If not set and create is
# true, a name is generated using the fullname template
name: ""
annotations: {}
securityContext: {}
## Defines a secret name for the controller to generate a memberlist encryption secret
## By default secretName: {{ "metallb.fullname" }}-memberlist
##
# secretName:
resources: {}
# limits:
# cpu: 100m
# memory: 100Mi
nodeSelector: {}
tolerations: []
priorityClassName: ""
affinity: {}
## Selects which runtime class will be used by the pod.
runtimeClassName: ""
podAnnotations: {}
labels: {}
livenessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
enabled: true
failureThreshold: 3
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
startupProbe:
enabled: true
failureThreshold: 30
periodSeconds: 5
# frr contains configuration specific to the MetalLB FRR container,
# for speaker running alongside FRR.
frr:
enabled: true
image:
repository: quay.io/frrouting/frr
tag: 9.0.2
pullPolicy:
metricsPort: 7473
resources: {}
# if set, enables a rbac proxy sidecar container on the speaker to
# expose the frr metrics via tls.
# secureMetricsPort: 9121
reloader:
resources: {}
frrMetrics:
resources: {}
extraContainers: []
crds:
enabled: true
validationFailurePolicy: Fail
# frrk8s contains the configuration related to using an frrk8s instance
# (github.com/metallb/frr-k8s) as the backend for the BGP implementation.
# This allows configuring additional frr parameters in combination to those
# applied by MetalLB.
frrk8s:
# if set, enables frrk8s as a backend. This is mutually exclusive to frr
# mode.
enabled: false

View File

@ -0,0 +1,2 @@
rook version v1.11.4
For documentation on running Rook in your Kubernetes cluster see the [Kubernetes Quickstart Guide](/Documentation/Getting-Started/quickstart.md)

templates/k8s_cluster/storage/rook/cluster-external-management.yaml Normal file

@ -0,0 +1,22 @@
#################################################################################################################
# Define the settings for the rook-ceph-external cluster with common settings for a production cluster.
# For example, if Rook is not managing any existing cluster in the 'rook-ceph' namespace do:
# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
# kubectl create -f cluster-external.yaml
# If there is already a cluster managed by Rook in 'rook-ceph' then run:
# kubectl create -f common-external.yaml -f cluster-external-management.yaml
#################################################################################################################
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: rook-ceph-external
namespace: rook-ceph-external # namespace:cluster
spec:
external:
enable: true
dataDirHostPath: /var/lib/rook
# providing an image is required, if you want to create other CRs (rgw, mds, nfs)
cephVersion:
image: quay.io/ceph/ceph:v17.2.6 # Should match external cluster version

templates/k8s_cluster/storage/rook/cluster-external.yaml Normal file

@ -0,0 +1,39 @@
#################################################################################################################
# Define the settings for the rook-ceph-external cluster with common settings for a production cluster.
# For example, if Rook is not managing any existing cluster in the 'rook-ceph' namespace do:
# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
# kubectl create -f cluster-external.yaml
# If there is already a cluster managed by Rook in 'rook-ceph' then do:
# kubectl create -f common-external.yaml
# kubectl create -f cluster-external.yaml
#################################################################################################################
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: rook-ceph-external
namespace: rook-ceph-external # namespace:cluster
spec:
external:
enable: true
crashCollector:
disable: true
network:
connections:
encryption:
enabled: false
compression:
enabled: false
healthCheck:
daemonHealth:
mon:
disabled: false
interval: 45s
# optionally, the ceph-mgr IP address can be passed to gather metric from the prometheus exporter
# monitoring:
# enabled: true
# rulesNamespace: rook-ceph
# externalMgrEndpoints:
#- ip: ip
# externalMgrPrometheusPort: 9283

templates/k8s_cluster/storage/rook/cluster-test.yaml Normal file

@ -0,0 +1,63 @@
#################################################################################################################
# Define the settings for the rook-ceph cluster with common settings for a small test cluster.
# All nodes with available raw devices will be used for the Ceph cluster. One node is sufficient
# in this example.
# For example, to create the cluster:
# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
# kubectl create -f cluster-test.yaml
#################################################################################################################
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: {{ rook_cluster_configs.dev.name }}
namespace: rook-ceph # namespace:cluster
spec:
dataDirHostPath: /var/lib/rook
cephVersion:
image: quay.io/ceph/ceph:v18
allowUnsupported: true
mon:
count: {{ rook_cluster_configs.dev.mons }}
allowMultiplePerNode: true
mgr:
count: {{ rook_cluster_configs.dev.mgrs }}
allowMultiplePerNode: true
dashboard:
enabled: true
crashCollector:
disable: true
storage:
useAllNodes: true
useAllDevices: true
#deviceFilter:
monitoring:
enabled: false
healthCheck:
daemonHealth:
mon:
interval: 45s
timeout: 600s
priorityClassNames:
all: system-node-critical
mgr: system-cluster-critical
disruptionManagement:
managePodBudgets: true
cephConfig:
global:
osd_pool_default_size: "1"
mon_warn_on_pool_no_redundancy: "false"
bdev_flock_retry: "20"
bluefs_buffered_io: "false"
mon_data_avail_warn: "10"
---
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
name: builtin-mgr
namespace: rook-ceph # namespace:cluster
spec:
name: .mgr
replicated:
size: 1
requireSafeReplicaSize: false

templates/k8s_cluster/storage/rook/cluster.yaml Normal file

@ -0,0 +1,312 @@
#################################################################################################################
# Define the settings for the rook-ceph cluster with common settings for a production cluster.
# All nodes with available raw devices will be used for the Ceph cluster. At least three nodes are required
# in this example. See the documentation for more details on storage settings available.
# For example, to create the cluster:
# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
# kubectl create -f cluster.yaml
#################################################################################################################
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: rook-ceph
namespace: rook-ceph # namespace:cluster
spec:
cephVersion:
# The container image used to launch the Ceph daemon pods (mon, mgr, osd, mds, rgw).
# v16 is Pacific, and v17 is Quincy.
# RECOMMENDATION: In production, use a specific version tag instead of the general v17 flag, which pulls the latest release and could result in different
# versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/.
# If you want to be more precise, you can always use a timestamp tag such as quay.io/ceph/ceph:v17.2.6-20230410
# This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities
image: quay.io/ceph/ceph:v17.2.6
# Whether to allow unsupported versions of Ceph. Currently `pacific`, `quincy`, and `reef` are supported.
# Future versions such as `squid` (v19) would require this to be set to `true`.
# Do not set to true in production.
allowUnsupported: false
# The path on the host where configuration files will be persisted. Must be specified.
# Important: if you reinstall the cluster, make sure you delete this directory from each host or else the mons will fail to start on the new cluster.
# In Minikube, the '/data' directory is configured to persist across reboots. Use "/data/rook" in Minikube environment.
dataDirHostPath: /var/lib/rook
# Whether or not upgrade should continue even if a check fails
# This means Ceph's status could be degraded and we don't recommend upgrading but you might decide otherwise
# Use at your OWN risk
# To understand Rook's upgrade process of Ceph, read https://rook.io/docs/rook/latest/ceph-upgrade.html#ceph-version-upgrades
skipUpgradeChecks: false
# Whether or not continue if PGs are not clean during an upgrade
continueUpgradeAfterChecksEvenIfNotHealthy: false
# WaitTimeoutForHealthyOSDInMinutes defines the time (in minutes) the operator would wait before an OSD can be stopped for upgrade or restart.
# If the timeout exceeds and OSD is not ok to stop, then the operator would skip upgrade for the current OSD and proceed with the next one
# if `continueUpgradeAfterChecksEvenIfNotHealthy` is `false`. If `continueUpgradeAfterChecksEvenIfNotHealthy` is `true`, then operator would
# continue with the upgrade of an OSD even if its not ok to stop after the timeout. This timeout won't be applied if `skipUpgradeChecks` is `true`.
# The default wait timeout is 10 minutes.
waitTimeoutForHealthyOSDInMinutes: 10
mon:
# Set the number of mons to be started. Generally recommended to be 3.
# For highest availability, an odd number of mons should be specified.
count: 3
# The mons should be on unique nodes. For production, at least 3 nodes are recommended for this reason.
# Mons should only be allowed on the same node for test environments where data loss is acceptable.
allowMultiplePerNode: false
mgr:
# When higher availability of the mgr is needed, increase the count to 2.
# In that case, one mgr will be active and one in standby. When Ceph updates which
# mgr is active, Rook will update the mgr services to match the active mgr.
count: 2
allowMultiplePerNode: false
modules:
# Several modules should not need to be included in this list. The "dashboard" and "monitoring" modules
# are already enabled by other settings in the cluster CR.
- name: pg_autoscaler
enabled: true
# enable the ceph dashboard for viewing cluster status
dashboard:
enabled: true
# serve the dashboard under a subpath (useful when you are accessing the dashboard via a reverse proxy)
# urlPrefix: /ceph-dashboard
# serve the dashboard at the given port.
# port: 8443
# serve the dashboard using SSL
ssl: true
# The url of the Prometheus instance
# prometheusEndpoint: <protocol>://<prometheus-host>:<port>
# Whether SSL should be verified if the Prometheus server is using https
# prometheusEndpointSSLVerify: false
# enable prometheus alerting for cluster
monitoring:
# requires Prometheus to be pre-installed
enabled: false
# Whether to disable the metrics reported by Ceph. If false, the prometheus mgr module and Ceph exporter are enabled.
# If true, the prometheus mgr module and Ceph exporter are both disabled. Default is false.
metricsDisabled: false
network:
connections:
# Whether to encrypt the data in transit across the wire to prevent eavesdropping the data on the network.
# The default is false. When encryption is enabled, all communication between clients and Ceph daemons, or between Ceph daemons will be encrypted.
# When encryption is not enabled, clients still establish a strong initial authentication and data integrity is still validated with a crc check.
# IMPORTANT: Encryption requires the 5.11 kernel for the latest nbd and cephfs drivers. Alternatively for testing only,
# you can set the "mounter: rbd-nbd" in the rbd storage class, or "mounter: fuse" in the cephfs storage class.
# The nbd and fuse drivers are *not* recommended in production since restarting the csi driver pod will disconnect the volumes.
encryption:
enabled: false
# Whether to compress the data in transit across the wire. The default is false.
# Requires Ceph Quincy (v17) or newer. Also see the kernel requirements above for encryption.
compression:
enabled: false
# Whether to require communication over msgr2. If true, the msgr v1 port (6789) will be disabled
# and clients will be required to connect to the Ceph cluster with the v2 port (3300).
# Requires a kernel that supports msgr v2 (kernel 5.11 or CentOS 8.4 or newer).
requireMsgr2: false
# enable host networking
#provider: host
# enable the Multus network provider
#provider: multus
#selectors:
# The selector keys are required to be `public` and `cluster`.
# Based on the configuration, the operator will do the following:
# 1. if only the `public` selector key is specified both public_network and cluster_network Ceph settings will listen on that interface
# 2. if both `public` and `cluster` selector keys are specified the first one will point to 'public_network' flag and the second one to 'cluster_network'
#
# In order to work, each selector value must match a NetworkAttachmentDefinition object in Multus
#
# public: public-conf --> NetworkAttachmentDefinition object name in Multus
# cluster: cluster-conf --> NetworkAttachmentDefinition object name in Multus
# Provide internet protocol version. IPv6, IPv4 or empty string are valid options. Empty string would mean IPv4
#ipFamily: "IPv6"
# Ceph daemons to listen on both IPv4 and Ipv6 networks
#dualStack: false
# Enable multiClusterService to export the mon and OSD services to peer cluster.
# This is useful to support RBD mirroring between two clusters having overlapping CIDRs.
# Ensure that peer clusters are connected using an MCS API compatible application, like Globalnet Submariner.
#multiClusterService:
# enabled: false
# enable the crash collector for ceph daemon crash collection
crashCollector:
disable: false
# Uncomment daysToRetain to prune ceph crash entries older than the
# specified number of days.
#daysToRetain: 30
# enable log collector, daemons will log on files and rotate
logCollector:
enabled: true
periodicity: daily # one of: hourly, daily, weekly, monthly
maxLogSize: 500M # SUFFIX may be 'M' or 'G'. Must be at least 1M.
# automate [data cleanup process](https://github.com/rook/rook/blob/master/Documentation/Storage-Configuration/ceph-teardown.md#delete-the-data-on-hosts) in cluster destruction.
cleanupPolicy:
# Since cluster cleanup is destructive to data, confirmation is required.
# To destroy all Rook data on hosts during uninstall, confirmation must be set to "yes-really-destroy-data".
# This value should only be set when the cluster is about to be deleted. After the confirmation is set,
# Rook will immediately stop configuring the cluster and only wait for the delete command.
# If the empty string is set, Rook will not destroy any data on hosts during uninstall.
confirmation: ""
# sanitizeDisks represents settings for sanitizing OSD disks on cluster deletion
sanitizeDisks:
# method indicates if the entire disk should be sanitized or simply ceph's metadata
# in both case, re-install is possible
# possible choices are 'complete' or 'quick' (default)
method: quick
# dataSource indicate where to get random bytes from to write on the disk
# possible choices are 'zero' (default) or 'random'
# using random sources will consume entropy from the system and will take much more time than the zero source
dataSource: zero
# iteration overwrite N times instead of the default (1)
# takes an integer value
iteration: 1
# allowUninstallWithVolumes defines how the uninstall should be performed
# If set to true, cephCluster deletion does not wait for the PVs to be deleted.
allowUninstallWithVolumes: false
# To control where various services will be scheduled by kubernetes, use the placement configuration sections below.
# The example under 'all' would have all services scheduled on kubernetes nodes labeled with 'role=storage-node' and
# tolerate taints with a key of 'storage-node'.
# placement:
# all:
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: role
# operator: In
# values:
# - storage-node
# podAffinity:
# podAntiAffinity:
# topologySpreadConstraints:
# tolerations:
# - key: storage-node
# operator: Exists
# The above placement information can also be specified for mon, osd, and mgr components
# mon:
# Monitor deployments may contain an anti-affinity rule for avoiding monitor
# collocation on the same node. This is a required rule when host network is used
# or when AllowMultiplePerNode is false. Otherwise this anti-affinity rule is a
# preferred rule with weight: 50.
# osd:
# prepareosd:
# mgr:
# cleanup:
annotations:
# all:
# mon:
# osd:
# cleanup:
prepareosd: {linkerd.io/inject: disabled}
# clusterMetadata annotations will be applied to only `rook-ceph-mon-endpoints` configmap and the `rook-ceph-mon` and `rook-ceph-admin-keyring` secrets.
# And clusterMetadata annotations will not be merged with `all` annotations.
# clusterMetadata:
# kubed.appscode.com/sync: "true"
# If no mgr annotations are set, prometheus scrape annotations will be set by default.
# mgr:
labels:
# all:
# mon:
# osd:
# cleanup:
# mgr:
# prepareosd:
# monitoring is a list of key-value pairs. It is injected into all the monitoring resources created by operator.
# These labels can be passed as LabelSelector to Prometheus
# monitoring:
# crashcollector:
resources:
#The requests and limits set here, allow the mgr pod to use half of one CPU core and 1 gigabyte of memory
# mgr:
# limits:
# cpu: "500m"
# memory: "1024Mi"
# requests:
# cpu: "500m"
# memory: "1024Mi"
# The above example requests/limits can also be added to the other components
# mon:
# osd:
# For OSD it also is a possible to specify requests/limits based on device class
# osd-hdd:
# osd-ssd:
# osd-nvme:
# prepareosd:
# mgr-sidecar:
# crashcollector:
# logcollector:
# cleanup:
# exporter:
# The option to automatically remove OSDs that are out and are safe to destroy.
removeOSDsIfOutAndSafeToRemove: false
priorityClassNames:
#all: rook-ceph-default-priority-class
mon: system-node-critical
osd: system-node-critical
mgr: system-cluster-critical
#crashcollector: rook-ceph-crashcollector-priority-class
storage: # cluster level storage configuration and selection
useAllNodes: true
useAllDevices: true
#deviceFilter:
config:
# crushRoot: "custom-root" # specify a non-default root label for the CRUSH map
# metadataDevice: "md0" # specify a non-rotational storage so ceph-volume will use it as block db device of bluestore.
# databaseSizeMB: "1024" # uncomment if the disks are smaller than 100 GB
# osdsPerDevice: "1" # this value can be overridden at the node or device level
# encryptedDevice: "true" # the default value for this option is "false"
# Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named
# nodes below will be used as storage resources. Each node's 'name' field should match its 'kubernetes.io/hostname' label.
# nodes:
# - name: "172.17.4.201"
# devices: # specific devices to use for storage can be specified for each node
# - name: "sdb"
# - name: "nvme01" # multiple osds can be created on high performance devices
# config:
# osdsPerDevice: "5"
# - name: "/dev/disk/by-id/ata-ST4000DM004-XXXX" # devices can be specified using full udev paths
# config: # configuration can be specified at the node level which overrides the cluster level config
# - name: "172.17.4.301"
# deviceFilter: "^sd."
# when onlyApplyOSDPlacement is false, the operator will merge both placement.All() and placement.osd
onlyApplyOSDPlacement: false
# Time for which an OSD pod will sleep before restarting, if it stopped due to flapping
# flappingRestartIntervalHours: 24
# The section for configuring management of daemon disruptions during upgrade or fencing.
disruptionManagement:
# If true, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically
# via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph/ceph-managed-disruptionbudgets.md). The operator will
# block eviction of OSDs by default and unblock them safely when drains are detected.
managePodBudgets: true
# A duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the
# default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes.
osdMaintenanceTimeout: 30
# A duration in minutes that the operator will wait for the placement groups to become healthy (active+clean) after a drain was completed and OSDs came back up.
# The operator will continue with the next drain if the timeout is exceeded. It only works if `managePodBudgets` is `true`.
# No value or 0 means that the operator will wait until the placement groups are healthy before unblocking the next drain.
pgHealthCheckTimeout: 0
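# With managePodBudgets enabled, the operator coordinates node drains through the
# PDBs described above; a typical drain command looks like this (node name is a
# placeholder):
#   kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data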
# healthChecks
# Valid values for daemons are 'mon', 'osd', 'status'
healthCheck:
daemonHealth:
mon:
disabled: false
interval: 45s
osd:
disabled: false
interval: 60s
status:
disabled: false
interval: 60s
# Change pod liveness probe timing or threshold values. Works for all mon,mgr,osd daemons.
livenessProbe:
mon:
disabled: false
mgr:
disabled: false
osd:
disabled: false
# Change pod startup probe timing or threshold values. Works for all mon,mgr,osd daemons.
startupProbe:
mon:
disabled: false
mgr:
disabled: false
osd:
disabled: false
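# Beyond disabling a probe, its timing can be tuned per daemon via a `probe`
# block; a minimal sketch, assuming the healthCheck API accepts the standard
# Kubernetes probe fields here (values are illustrative):
#   livenessProbe:
#     osd:
#       disabled: false
#       probe:
#         initialDelaySeconds: 10
#         periodSeconds: 30
#         timeoutSeconds: 5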

View File

@ -0,0 +1,77 @@
###################################################################################################################
# Create the common resources that are necessary to start an external Ceph cluster in a different namespace
# These resources can be created while an operator is already running, but they assume common.yaml has already been applied.
# The samples all assume that your existing operator, running in the "rook-ceph" namespace, will also watch and have permissions
# to interact with an external cluster configured in the "rook-ceph-external" namespace.
#
# kubectl create -f crds.yaml -f common.yaml -f operator.yaml -f common-external.yaml
#
# If there is no cluster managed by the current Rook Operator
# you can simply replace all occurrences of rook-ceph-external with rook-ceph
#
# And remove the following code:
#
# apiVersion: v1
# kind: Namespace
# metadata:
# name: rook-ceph-external
#
# Then kubectl create -f cluster-external.yaml
###################################################################################################################
apiVersion: v1
kind: Namespace
metadata:
name: rook-ceph-external # namespace:cluster
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-cluster-mgmt
namespace: rook-ceph-external # namespace:cluster
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: rook-ceph-cluster-mgmt
subjects:
- kind: ServiceAccount
name: rook-ceph-system
namespace: rook-ceph # namespace:operator
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-cmd-reporter
namespace: rook-ceph-external # namespace:cluster
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: rook-ceph-cmd-reporter
subjects:
- kind: ServiceAccount
name: rook-ceph-cmd-reporter
namespace: rook-ceph-external # namespace:cluster
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: rook-ceph-cmd-reporter
namespace: rook-ceph-external # namespace:cluster
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-cmd-reporter
namespace: rook-ceph-external # namespace:cluster
rules:
- apiGroups:
- ""
resources:
- pods
- configmaps
verbs:
- get
- list
- watch
- create
- update
- delete
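# An illustrative way to verify the RBAC objects above after applying this file:
#   kubectl -n rook-ceph-external get serviceaccount,role,rolebinding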

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,36 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: rook-cephfs
# Change "rook-ceph" provisioner prefix to match the operator namespace if needed
provisioner: rook-ceph.cephfs.csi.ceph.com # driver:namespace:operator
parameters:
# clusterID is the namespace where the rook cluster is running
# If you change this namespace, also change the namespace below where the secret namespaces are defined
clusterID: rook-ceph-external # namespace:cluster
# CephFS filesystem name into which the volume shall be created
fsName: sharedStoreK8s
# Ceph pool into which the volume shall be created
# Required for provisionVolume: "true"
pool: sharedStoreK8s_data
# The secrets contain Ceph admin credentials. These are generated automatically by the operator
# in the same namespace as the cluster.
csi.storage.k8s.io/provisioner-secret-name: rook-csi-cephfs-provisioner
csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph-external # namespace:cluster
csi.storage.k8s.io/controller-expand-secret-name: rook-csi-cephfs-provisioner
csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph-external # namespace:cluster
csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node
csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph-external # namespace:cluster
# (optional) The driver can use either ceph-fuse (fuse) or ceph kernel client (kernel)
# If omitted, the default volume mounter will be used - this is determined by probing for ceph-fuse
# or by setting the default mounter explicitly via the --volumemounter command-line argument.
# mounter: kernel
reclaimPolicy: Delete
allowVolumeExpansion: true
mountOptions:
# uncomment the following line for debugging
#- debug
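# A minimal PVC sketch consuming this class (name and size are illustrative);
# CephFS volumes support shared ReadWriteMany access:
#   apiVersion: v1
#   kind: PersistentVolumeClaim
#   metadata:
#     name: cephfs-pvc
#   spec:
#     accessModes:
#       - ReadWriteMany
#     storageClassName: rook-cephfs
#     resources:
#       requests:
#         storage: 1Gi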

View File

@ -0,0 +1,28 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: rook-cephfs
# Change "rook-ceph" provisioner prefix to match the operator namespace if needed
provisioner: rook-ceph.cephfs.csi.ceph.com
parameters:
# clusterID is the namespace where the rook cluster is running
# If you change this namespace, also change the namespace below where the secret namespaces are defined
clusterID: rook-ceph
# CephFS filesystem name into which the volume shall be created
fsName: myfs
# Ceph pool into which the volume shall be created
# Required for provisionVolume: "true"
pool: myfs-replicated
# The secrets contain Ceph admin credentials. These are generated automatically by the operator
# in the same namespace as the cluster.
csi.storage.k8s.io/provisioner-secret-name: rook-csi-cephfs-provisioner
csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
csi.storage.k8s.io/controller-expand-secret-name: rook-csi-cephfs-provisioner
csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
csi.storage.k8s.io/node-stage-secret-name: rook-csi-cephfs-node
csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
reclaimPolicy: Delete

View File

@ -0,0 +1,94 @@
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
name: replicapool
namespace: rook-ceph # namespace:cluster
spec:
failureDomain: host
replicated:
size: 3
# Disallow setting a pool with replica size 1, as this could lead to data loss without recovery.
# Only disable this if you are *ABSOLUTELY CERTAIN* that is what you want
requireSafeReplicaSize: true
# gives a hint (%) to Ceph in terms of expected consumption of the total cluster capacity of a given pool
# for more info: https://docs.ceph.com/docs/master/rados/operations/placement-groups/#specifying-expected-pool-size
#targetSizeRatio: .5
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: rook-ceph-block
annotations:
storageclass.kubernetes.io/is-default-class: 'true'
provisioner: rook-ceph.rbd.csi.ceph.com
parameters:
# clusterID is the namespace where the rook cluster is running
# If you change this namespace, also change the namespace below where the secret namespaces are defined
clusterID: rook-ceph-external # namespace:cluster
# If you want to use an erasure coded pool with RBD, you need to create
# two pools: one erasure coded and one replicated.
# You need to specify the replicated pool here in the `pool` parameter, it is
# used for the metadata of the images.
# The erasure coded pool must be set as the `dataPool` parameter below.
#dataPool: ec-data-pool
pool: k8sBlockStorage
# (optional) mapOptions is a comma-separated list of map options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# mapOptions: lock_on_read,queue_depth=1024
# (optional) unmapOptions is a comma-separated list of unmap options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# unmapOptions: force
# (optional) Set it to true to encrypt each volume with encryption keys
# from a key management system (KMS)
# encrypted: "true"
# (optional) Use external key management system (KMS) for encryption key by
# specifying a unique ID matching a KMS ConfigMap. The ID is only used for
# correlation to configmap entry.
# encryptionKMSID: <kms-config-id>
# RBD image format. Defaults to "2".
imageFormat: "2"
# RBD image features
# Available for imageFormat: "2". Older releases of CSI RBD
# support only the `layering` feature. The Linux kernel (KRBD) supports the
# full complement of features as of 5.4
# `layering` alone corresponds to Ceph's bitfield value of "2";
# `layering` + `fast-diff` + `object-map` + `deep-flatten` + `exclusive-lock` together
# correspond to Ceph's OR'd bitfield value of "63". Here we use
# a symbolic, comma-separated format:
# For 5.4 or later kernels:
#imageFeatures: layering,fast-diff,object-map,deep-flatten,exclusive-lock
# For 5.3 or earlier kernels:
imageFeatures: layering
# The secrets contain Ceph admin credentials. These are generated automatically by the operator
# in the same namespace as the cluster.
csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph-external # namespace:cluster
csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph-external # namespace:cluster
csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph-external # namespace:cluster
# Specify the filesystem type of the volume. If not specified, csi-provisioner
# will set default as `ext4`. Note that `xfs` is not recommended due to potential deadlock
# in hyperconverged settings where the volume is mounted on the same node as the osds.
csi.storage.k8s.io/fstype: ext4
# uncomment the following to use rbd-nbd as mounter on supported nodes
# **IMPORTANT**: From CephCSI v3.4.0 onwards, volume healer functionality reattaches
# the PVC to the application pod if the nodeplugin pod restarts.
# It's still in Alpha support. Therefore, this option is not recommended for production use.
#mounter: rbd-nbd
allowVolumeExpansion: true
reclaimPolicy: Delete
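# A minimal PVC sketch consuming this class (name and size are illustrative);
# RBD volumes are typically mounted ReadWriteOnce:
#   apiVersion: v1
#   kind: PersistentVolumeClaim
#   metadata:
#     name: rbd-pvc
#   spec:
#     accessModes:
#       - ReadWriteOnce
#     storageClassName: rook-ceph-block
#     resources:
#       requests:
#         storage: 1Gi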

View File

@ -0,0 +1,71 @@
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
name: replicapool
namespace: rook-ceph
spec:
failureDomain: host
replicated:
size: 1
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: rook-ceph-block
# Change "rook-ceph" provisioner prefix to match the operator namespace if needed
provisioner: rook-ceph.rbd.csi.ceph.com
parameters:
# clusterID is the namespace where the rook cluster is running
clusterID: rook-ceph
# Ceph pool into which the RBD image shall be created
pool: replicapool
# (optional) mapOptions is a comma-separated list of map options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# mapOptions: lock_on_read,queue_depth=1024
# (optional) unmapOptions is a comma-separated list of unmap options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# unmapOptions: force
# RBD image format. Defaults to "2".
imageFormat: "2"
# RBD image features
# Available for imageFormat: "2". Older releases of CSI RBD
# support only the `layering` feature. The Linux kernel (KRBD) supports the
# full complement of features as of 5.4
# `layering` alone corresponds to Ceph's bitfield value of "2";
# `layering` + `fast-diff` + `object-map` + `deep-flatten` + `exclusive-lock` together
# correspond to Ceph's OR'd bitfield value of "63". Here we use
# a symbolic, comma-separated format:
# For 5.4 or later kernels:
#imageFeatures: layering,fast-diff,object-map,deep-flatten,exclusive-lock
# For 5.3 or earlier kernels:
imageFeatures: layering
# The secrets contain Ceph admin credentials.
csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
# Specify the filesystem type of the volume. If not specified, csi-provisioner
# will set default as `ext4`. Note that `xfs` is not recommended due to potential deadlock
# in hyperconverged settings where the volume is mounted on the same node as the osds.
csi.storage.k8s.io/fstype: ext4
# Delete the rbd volume when a PVC is deleted
reclaimPolicy: Delete
# Optional, if you want to enable dynamic resizing of PVCs.
# For now, only ext3, ext4, and xfs resize support is provided, as in Kubernetes itself.
allowVolumeExpansion: true

View File

@ -0,0 +1,93 @@
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
name: replicapool
namespace: rook-ceph # namespace:cluster
spec:
failureDomain: host
replicated:
size: 3
# Disallow setting a pool with replica size 1, as this could lead to data loss without recovery.
# Only disable this if you are *ABSOLUTELY CERTAIN* that is what you want
requireSafeReplicaSize: true
# gives a hint (%) to Ceph in terms of expected consumption of the total cluster capacity of a given pool
# for more info: https://docs.ceph.com/docs/master/rados/operations/placement-groups/#specifying-expected-pool-size
#targetSizeRatio: .5
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: rook-ceph-block
# Change "rook-ceph" provisioner prefix to match the operator namespace if needed
provisioner: rook-ceph.rbd.csi.ceph.com
parameters:
# clusterID is the namespace where the rook cluster is running
# If you change this namespace, also change the namespace below where the secret namespaces are defined
clusterID: rook-ceph # namespace:cluster
# If you want to use an erasure coded pool with RBD, you need to create
# two pools: one erasure coded and one replicated.
# You need to specify the replicated pool here in the `pool` parameter, it is
# used for the metadata of the images.
# The erasure coded pool must be set as the `dataPool` parameter below.
#dataPool: ec-data-pool
pool: replicapool
# (optional) mapOptions is a comma-separated list of map options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# mapOptions: lock_on_read,queue_depth=1024
# (optional) unmapOptions is a comma-separated list of unmap options.
# For krbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd/#kernel-rbd-krbd-options
# For nbd options refer
# https://docs.ceph.com/docs/master/man/8/rbd-nbd/#options
# unmapOptions: force
# (optional) Set it to true to encrypt each volume with encryption keys
# from a key management system (KMS)
# encrypted: "true"
# (optional) Use external key management system (KMS) for encryption key by
# specifying a unique ID matching a KMS ConfigMap. The ID is only used for
# correlation to configmap entry.
# encryptionKMSID: <kms-config-id>
# RBD image format. Defaults to "2".
imageFormat: "2"
# RBD image features
# Available for imageFormat: "2". Older releases of CSI RBD
# support only the `layering` feature. The Linux kernel (KRBD) supports the
# full complement of features as of 5.4
# `layering` alone corresponds to Ceph's bitfield value of "2";
# `layering` + `fast-diff` + `object-map` + `deep-flatten` + `exclusive-lock` together
# correspond to Ceph's OR'd bitfield value of "63". Here we use
# a symbolic, comma-separated format:
# For 5.4 or later kernels:
#imageFeatures: layering,fast-diff,object-map,deep-flatten,exclusive-lock
# For 5.3 or earlier kernels:
imageFeatures: layering
# The secrets contain Ceph admin credentials. These are generated automatically by the operator
# in the same namespace as the cluster.
csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph # namespace:cluster
csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph # namespace:cluster
csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph # namespace:cluster
# Specify the filesystem type of the volume. If not specified, csi-provisioner
# will set default as `ext4`. Note that `xfs` is not recommended due to potential deadlock
# in hyperconverged settings where the volume is mounted on the same node as the osds.
csi.storage.k8s.io/fstype: ext4
# uncomment the following to use rbd-nbd as mounter on supported nodes
# **IMPORTANT**: From CephCSI v3.4.0 onwards, volume healer functionality reattaches
# the PVC to the application pod if the nodeplugin pod restarts.
# It's still in Alpha support. Therefore, this option is not recommended for production use.
#mounter: rbd-nbd
allowVolumeExpansion: true
reclaimPolicy: Delete

View File

@ -0,0 +1,22 @@
#################################################################################################################
# Create a filesystem with settings for a test environment where only a single OSD is required.
# kubectl create -f filesystem-test.yaml
#################################################################################################################
apiVersion: ceph.rook.io/v1
kind: CephFilesystem
metadata:
name: myfs
namespace: rook-ceph
spec:
metadataPool:
replicated:
size: 1
dataPools:
- name: replicated
replicated:
size: 1
preserveFilesystemOnDelete: true
metadataServer:
activeCount: 1
activeStandby: true

View File

@ -0,0 +1,138 @@
#################################################################################################################
# Create a filesystem with settings with replication enabled for a production environment.
# A minimum of 3 OSDs on different nodes are required in this example.
# If one mds daemon per node is too restrictive, see the podAntiAffinity below.
# kubectl create -f filesystem.yaml
#################################################################################################################
apiVersion: ceph.rook.io/v1
kind: CephFilesystem
metadata:
name: myfs
namespace: rook-ceph # namespace:cluster
spec:
# The metadata pool spec. Must use replication.
metadataPool:
replicated:
size: 3
requireSafeReplicaSize: true
parameters:
# Inline compression mode for the data pool
# Further reference: https://docs.ceph.com/docs/master/rados/configuration/bluestore-config-ref/#inline-compression
compression_mode:
none
# gives a hint (%) to Ceph in terms of expected consumption of the total cluster capacity of a given pool
# for more info: https://docs.ceph.com/docs/master/rados/operations/placement-groups/#specifying-expected-pool-size
#target_size_ratio: ".5"
# The list of data pool specs. Can use replication or erasure coding.
dataPools:
- name: replicated
failureDomain: host
replicated:
size: 3
# Disallow setting a pool with replica size 1, as this could lead to data loss without recovery.
# Only disable this if you are *ABSOLUTELY CERTAIN* that is what you want
requireSafeReplicaSize: true
parameters:
# Inline compression mode for the data pool
# Further reference: https://docs.ceph.com/docs/master/rados/configuration/bluestore-config-ref/#inline-compression
compression_mode:
none
# gives a hint (%) to Ceph in terms of expected consumption of the total cluster capacity of a given pool
# for more info: https://docs.ceph.com/docs/master/rados/operations/placement-groups/#specifying-expected-pool-size
#target_size_ratio: ".5"
# Whether to preserve filesystem after CephFilesystem CRD deletion
preserveFilesystemOnDelete: true
# The metadata service (mds) configuration
metadataServer:
# The number of active MDS instances
activeCount: 1
# Whether each active MDS instance will have an active standby with a warm metadata cache for faster failover.
# If false, standbys will be available, but will not have a warm cache.
activeStandby: true
# The affinity rules to apply to the mds deployment
placement:
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: role
# operator: In
# values:
# - mds-node
# topologySpreadConstraints:
# tolerations:
# - key: mds-node
# operator: Exists
# podAffinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- rook-ceph-mds
## Add this if you want to allow mds daemons for different filesystems to run on one
## node. The value in "values" must match .metadata.name.
# - key: rook_file_system
# operator: In
# values:
# - myfs
# topologyKey: kubernetes.io/hostname will place MDS across different hosts
topologyKey: kubernetes.io/hostname
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- rook-ceph-mds
# topologyKey: */zone can be used to spread MDS across different AZ
# Use <topologyKey: failure-domain.beta.kubernetes.io/zone> if your k8s cluster is v1.16 or lower
# Use <topologyKey: topology.kubernetes.io/zone> if your k8s cluster is v1.17 or higher
topologyKey: topology.kubernetes.io/zone
# A key/value list of annotations
# annotations:
# key: value
# A key/value list of labels
# labels:
# key: value
# resources:
# The requests and limits set here, allow the filesystem MDS Pod(s) to use half of one CPU core and 1 gigabyte of memory
# limits:
# cpu: "500m"
# memory: "1024Mi"
# requests:
# cpu: "500m"
# memory: "1024Mi"
priorityClassName: system-cluster-critical
livenessProbe:
disabled: false
startupProbe:
disabled: false
# Filesystem mirroring settings
# mirroring:
# enabled: true
# list of Kubernetes Secrets containing the peer token
# for more details see: https://docs.ceph.com/en/latest/dev/cephfs-mirroring/#bootstrap-peers
# Add the secret name if it already exists; otherwise specify an empty list here.
# peers:
#secretNames:
#- secondary-cluster-peer
# specify the schedule(s) on which snapshots should be taken
# see the official syntax here https://docs.ceph.com/en/latest/cephfs/snap-schedule/#add-and-remove-schedules
# snapshotSchedules:
# - path: /
# interval: 24h # daily snapshots
# The startTime should be given in the format YYYY-MM-DDTHH:MM:SS
# If startTime is not specified, the start time defaults to midnight UTC.
# see usage here https://docs.ceph.com/en/latest/cephfs/snap-schedule/#usage
# startTime: 2022-07-15T11:55:00
# manage retention policies
# see syntax duration here https://docs.ceph.com/en/latest/cephfs/snap-schedule/#add-and-remove-retention-policies
# snapshotRetention:
# - path: /
# duration: "h 24"

View File

@ -0,0 +1,23 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: csi-metrics
namespace: rook-ceph
labels:
team: rook
spec:
namespaceSelector:
matchNames:
- rook-ceph
selector:
matchLabels:
app: csi-metrics
endpoints:
- port: csi-http-metrics
path: /metrics
interval: 5s
# comment csi-grpc-metrics related information if csi grpc metrics is not enabled
- port: csi-grpc-metrics
path: /metrics
interval: 5s
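# This resource assumes the Prometheus Operator CRDs are already installed; an
# illustrative check:
#   kubectl get crd servicemonitors.monitoring.coreos.com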

View File

@ -0,0 +1,35 @@
# Copied from /deploy/charts/rook-ceph-cluster/prometheus/, CR header added, and indentation increased on the groups
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: rook-prometheus
role: alert-rules
name: prometheus-ceph-rules
namespace: rook-ceph
spec:
groups:
- name: persistent-volume-alert.rules
rules:
- alert: PersistentVolumeUsageNearFull
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%. Free up some space or expand the PVC.
message: PVC {{ $labels.persistentvolumeclaim }} is nearing full. Data deletion or PVC expansion is required.
severity_level: warning
storage_type: ceph
expr: |
(kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
for: 5s
labels:
severity: warning
- alert: PersistentVolumeUsageCritical
annotations:
description: PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%. Free up some space or expand the PVC immediately.
message: PVC {{ $labels.persistentvolumeclaim }} is critically full. Data deletion or PVC expansion is required.
severity_level: error
storage_type: ceph
expr: |
(kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
for: 5s
labels:
severity: critical

View File

@ -0,0 +1,19 @@
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
name: rgw-scale
namespace: rook-ceph
spec:
scaleTargetRef:
kind: Deployment
name: rook-ceph-rgw-my-store-a
minReplicaCount: 1
maxReplicaCount: 5
triggers:
- type: prometheus
metadata:
serverAddress: http://rook-prometheus.rook-ceph.svc:9090
metricName: ceph_rgw_put_collector
query: |
sum(rate(ceph_rgw_put[2m]))
threshold: "90"
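# KEDA backs a ScaledObject with a generated HPA, conventionally named
# keda-hpa-<scaledobject-name>; an illustrative check, assuming that convention:
#   kubectl -n rook-ceph get hpa keda-hpa-rgw-scale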

View File

@ -0,0 +1,846 @@
# Copied from /deploy/charts/rook-ceph-cluster/prometheus/, CR header added, and indentation increased on the groups
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: rook-prometheus
role: alert-rules
name: prometheus-ceph-rules
namespace: rook-ceph
spec:
groups:
- name: cluster health
rules:
- alert: CephHealthError
expr: ceph_health_status == 2
for: 5m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.2.1
annotations:
summary: Cluster is in the ERROR state
description: >
The cluster state has been HEALTH_ERROR for more than 5 minutes.
Please check "ceph health detail" for more information.
- alert: CephHealthWarning
expr: ceph_health_status == 1
for: 15m
labels:
severity: warning
type: ceph_default
annotations:
summary: Cluster is in the WARNING state
description: >
The cluster state has been HEALTH_WARN for more than 15 minutes.
Please check "ceph health detail" for more information.
- name: mon
rules:
- alert: CephMonDownQuorumAtRisk
expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
for: 30s
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.3.1
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
summary: Monitor quorum is at risk
description: |
{{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active
Without quorum the cluster will become inoperable, affecting all services and connected clients.
The following monitors are down:
{{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
- {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
{{- end }}
- alert: CephMonDown
expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
for: 30s
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
summary: One or more monitors down
description: |
{{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}There are {{ $down }} monitor{{ $s }} down.
Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable.
The following monitors are down:
{{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
- {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
{{- end }}
- alert: CephMonDiskspaceCritical
expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.3.2
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
summary: Filesystem space on at least one monitor is critically low
description: |
The free space available to a monitor's store is critically low.
You should increase the space available to the monitor(s). The default directory
is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and under
/var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook.
Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files.
Also check any other directories under /var/lib/rook and other directories on the
same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are:
{{- range query "ceph_mon_metadata"}}
- {{ .Labels.hostname }}
{{- end }}
- alert: CephMonDiskspaceLow
expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
for: 5m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
summary: Disk space on at least one monitor is approaching full
description: |
The space available to a monitor's store is approaching full (>70% is the default).
You should increase the space available to the monitor(s). The default directory
is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and under
/var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook.
Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files.
Also check any other directories under /var/lib/rook and other directories on the
same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are:
{{- range query "ceph_mon_metadata"}}
- {{ .Labels.hostname }}
{{- end }}
- alert: CephMonClockSkew
expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
summary: Clock skew detected among monitors
description: |
Ceph monitors rely on closely synchronized time to maintain
quorum and cluster consistency. This event indicates that time on at least
one mon has drifted too far from the lead mon.
Review cluster status with ceph -s. This will show which monitors
are affected. Check the time sync status on each monitor host with
"ceph time-sync-status" and the state and peers of your ntpd or chrony daemon.
- name: osd
rules:
- alert: CephOSDDownHigh
expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.1
annotations:
summary: More than 10% of OSDs are down
description: |
{{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%).
The following OSDs are down:
{{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
- {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
{{- end }}
- alert: CephOSDHostDown
expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
for: 5m
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.8
annotations:
summary: An OSD host is offline
description: |
The following OSDs are down:
{{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
- {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }}
{{- end }}
- alert: CephOSDDown
expr: ceph_health_detail{name="OSD_DOWN"} == 1
for: 5m
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.2
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
summary: An OSD has been marked down
description: |
{{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins.
The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
{{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
- {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
{{- end }}
- alert: CephOSDNearFull
expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
for: 5m
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.3
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
summary: OSD(s) running low on free space (NEARFULL)
description: |
One or more OSDs have reached the NEARFULL threshold
Use 'ceph health detail' and 'ceph osd df' to identify the problem.
To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
- alert: CephOSDFull
expr: ceph_health_detail{name="OSD_FULL"} > 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.6
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
summary: OSD full, writes blocked
description: |
An OSD has reached the FULL threshold. Writes to pools that share the
affected OSD will be blocked.
Use 'ceph health detail' and 'ceph osd df' to identify the problem.
To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
- alert: CephOSDBackfillFull
expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
summary: OSD(s) too full for backfill operations
description: |
An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations
from completing.
Use 'ceph health detail' and 'ceph osd df' to identify the problem.
To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
- alert: CephOSDTooManyRepairs
expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
for: 30s
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
summary: OSD reports a high number of read errors
description: |
Reads from an OSD have used a secondary PG to return data to the client, indicating
a potential failing disk.
- alert: CephOSDTimeoutsPublicNetwork
expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
summary: Network issues delaying OSD heartbeats (public network)
description: |
OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
for latency or loss issues. Use 'ceph health detail' to show the affected OSDs.
- alert: CephOSDTimeoutsClusterNetwork
expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
summary: Network issues delaying OSD heartbeats (cluster network)
description: |
OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
for latency or loss issues. Use 'ceph health detail' to show the affected OSDs.
- alert: CephOSDInternalDiskSizeMismatch
expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
summary: OSD size inconsistency error
description: |
One or more OSDs have an internal inconsistency between metadata and the size of the device.
This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs.
- alert: CephDeviceFailurePredicted
expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
summary: Device(s) predicted to fail soon
description: |
The device health module has determined that one or more devices will fail
soon. To review device status use 'ceph device ls'. To show a specific
device use 'ceph device info <dev id>'.
Mark the OSD out so that data may migrate to other OSDs. Once
the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD.
- alert: CephDeviceFailurePredictionTooHigh
expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.7
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
summary: Too many devices are predicted to fail, unable to resolve
description: |
The device health module has determined that devices predicted to
fail cannot be remediated automatically, since too many OSDs would be removed from the
cluster to ensure performance and availability. Prevent data
integrity issues by adding new OSDs so that data may be relocated.
- alert: CephDeviceFailureRelocationIncomplete
expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
summary: Device failure is predicted, but unable to relocate data
description: |
The device health module has determined that one or more devices will fail
soon, but the normal process of relocating the data on the device to other
OSDs in the cluster is blocked.
Ensure that the cluster has available free space. It may be necessary to add
capacity to the cluster to allow the data from the failing device to
successfully migrate, or to enable the balancer.
- alert: CephOSDFlapping
expr: |
(
rate(ceph_osd_up[5m])
* on(ceph_daemon) group_left(hostname) ceph_osd_metadata
) * 60 > 1
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.4
annotations:
documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
summary: Network issues are causing OSDs to flap (mark each other down)
description: >
OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
marked down and back up {{ $value | humanize }} times once a
minute for 5 minutes. This may indicate a network issue (latency,
packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network
is deployed. Check network stats on the listed host(s).
- alert: CephOSDReadErrors
expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
for: 30s
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
summary: Device read errors detected
description: >
An OSD has encountered read errors, but the OSD has recovered by retrying
the reads. This may indicate an issue with hardware or the kernel.
# alert on high deviation from average PG count
- alert: CephPGImbalance
expr: |
abs(
(
(ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
for: 5m
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.4.5
annotations:
summary: PGs are not balanced across OSDs
description: >
OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
by more than 30% from average PG count.
# alert on high commit latency...but how high is too high
- name: mds
rules:
- alert: CephFilesystemDamaged
expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.1
annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
summary: CephFS filesystem is damaged.
description: >
Filesystem metadata has been corrupted. Data may be inaccessible.
Analyze metrics from the MDS daemon admin socket, or
escalate to support.
- alert: CephFilesystemOffline
expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.3
annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
summary: CephFS filesystem is offline
description: >
All MDS ranks are unavailable. The MDS daemons managing metadata
are down, rendering the filesystem offline.
- alert: CephFilesystemDegraded
expr: ceph_health_detail{name="FS_DEGRADED"} > 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.4
annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
summary: CephFS filesystem is degraded
description: >
One or more metadata daemons (MDS ranks) are failed or in a
damaged state. At best the filesystem is partially available,
at worst the filesystem is completely unusable.
- alert: CephFilesystemMDSRanksLow
expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
summary: MDS daemon count is lower than configured
description: >
The filesystem's "max_mds" setting defines the number of MDS ranks in
the filesystem. The current number of active MDS daemons is less than
this value.
- alert: CephFilesystemInsufficientStandby
expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
summary: Ceph filesystem standby daemons too few
description: >
The minimum number of standby daemons required by standby_count_wanted
is less than the current number of standby daemons. Adjust the standby count
or increase the number of MDS daemons.
- alert: CephFilesystemFailureNoStandby
expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.5
annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
summary: MDS daemon failed, no further standby available
description: >
An MDS daemon has failed, leaving only one active rank and no
available standby. Investigate the cause of the failure or add a
standby MDS.
- alert: CephFilesystemReadOnly
expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.5.2
annotations:
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
summary: CephFS filesystem in read only mode due to write error(s)
description: >
The filesystem has switched to READ ONLY due to an unexpected
error when writing to the metadata pool.
Analyze the output from the MDS daemon admin socket, or
escalate to support.
- name: mgr
rules:
- alert: CephMgrModuleCrash
expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
for: 5m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.6.1
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
summary: A manager module has recently crashed
description: >
One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A
crashed module may impact functionality within the cluster. Use the 'ceph crash' command to
determine which module has failed, and archive it to acknowledge the failure.
- alert: CephMgrPrometheusModuleInactive
expr: up{job="ceph"} == 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.6.2
annotations:
summary: The mgr/prometheus module is not available
description: >
The mgr/prometheus module at {{ $labels.instance }} is unreachable. This
could mean that the module has been disabled or the mgr daemon itself is down.
Without the mgr/prometheus module metrics and alerts will no longer
function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the
mgr is active. If the mgr is not active, restart it, otherwise you can determine
the mgr/prometheus module status with 'ceph mgr module ls'. If it is
not listed as enabled, enable it with 'ceph mgr module enable prometheus'.
- name: pgs
rules:
- alert: CephPGsInactive
expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
for: 5m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.1
annotations:
summary: One or more placement groups are inactive
description: >
{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
Inactive placement groups are not able to serve read/write requests.
- alert: CephPGsUnclean
expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
for: 15m
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.2
annotations:
summary: One or more placement groups are marked unclean
description: >
{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}.
Unclean PGs have not recovered from a previous failure.
- alert: CephPGsDamaged
expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
for: 5m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.4
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
summary: Placement group damaged; manual intervention needed
description: >
Scrubs have flagged at least one PG as damaged or inconsistent.
Check to see which PG is affected, and attempt a manual repair if necessary. To list
problematic placement groups, use 'ceph health detail' or 'rados list-inconsistent-pg <pool>'. To repair PGs use
the 'ceph pg repair <pg_num>' command.
- alert: CephPGRecoveryAtRisk
expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.5
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
summary: OSDs are too full for recovery
description: >
Data redundancy is at risk since one or more OSDs are at or above the
'full' threshold. Add capacity to the cluster, restore down/out OSDs, or delete unwanted data.
- alert: CephPGUnavailableBlockingIO
# PG_AVAILABILITY, but an OSD is not in a DOWN state
expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.3
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
summary: PG is unavailable, blocking I/O
description: >
Data availability is reduced, impacting the cluster's ability to service I/O. One or
more placement groups (PGs) are in a state that blocks I/O.
- alert: CephPGBackfillAtRisk
expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.7.6
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
summary: Backfill operations are blocked due to lack of free space
description: >
Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
- alert: CephPGNotScrubbed
expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
for: 5m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
summary: Placement group(s) have not been scrubbed
description: |
One or more PGs have not been scrubbed recently. Scrubs check metadata integrity,
protecting against bit-rot. They check that metadata
is consistent across data replicas. When PGs miss their scrub interval, it may
indicate that the scrub window is too small, or PGs were not in a 'clean' state during the
scrub window.
You can manually initiate a scrub with: ceph pg scrub <pgid>
- alert: CephPGsHighPerOSD
expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
for: 1m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
summary: Placement groups per OSD is too high
description: |
The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).
Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status',
and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
the autoscaler based on the expected relative size of the pool
('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler
mode to "warn" and adjust pg_num appropriately for one or more pools.
- alert: CephPGNotDeepScrubbed
expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
for: 5m
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
summary: Placement group(s) have not been deep scrubbed
description: |
One or more PGs have not been deep scrubbed recently. Deep scrubs
protect against bit-rot. They compare data
replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate
that the window is too small or PGs were not in a 'clean' state during the deep-scrub
window.
You can manually initiate a deep scrub with: ceph pg deep-scrub <pgid>
- name: nodes
rules:
- alert: CephNodeRootFilesystemFull
expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
for: 5m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.8.1
annotations:
summary: Root filesystem is dangerously full
description: >
Root volume is dangerously full: {{ $value | humanize }}% free.
# alert on packet errors and drop rate
- alert: CephNodeNetworkPacketDrops
expr: |
(
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
increase(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
increase(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
increase(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.8.2
annotations:
summary: One or more NICs reports packet drops
description: >
Node {{ $labels.instance }} experiences packet drop > 0.01% or >
10 packets/s on interface {{ $labels.device }}.
- alert: CephNodeNetworkPacketErrors
expr: |
(
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
increase(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
increase(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
increase(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.8.3
annotations:
summary: One or more NICs reports packet errors
description: >
Node {{ $labels.instance }} experiences packet errors > 0.01% or
> 10 packets/s on interface {{ $labels.device }}.
# Restrict to device names beginning with '/' to skip false alarms from
# tmpfs, overlay type filesystems
- alert: CephNodeDiskspaceWarning
expr: |
predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
on(instance) group_left(nodename) node_uname_info < 0
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.8.4
annotations:
summary: Host filesystem free space is low
description: >
Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
will be full in less than 5 days based on the 48 hour trailing
fill rate.
- alert: CephNodeInconsistentMTU
expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
labels:
severity: warning
type: ceph_default
annotations:
summary: MTU settings across hosts are inconsistent
description: >
Node {{ $labels.instance }} has a different MTU size ({{ $value }})
than the median value on device {{ $labels.device }}.
- name: pools
rules:
- alert: CephPoolGrowthWarning
expr: |
(predict_linear((max(ceph_pool_percent_used) without (pod, instance))[2d:1h], 3600 * 24 * 5) * on(pool_id)
group_right ceph_pool_metadata) >= 95
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.9.2
annotations:
summary: Pool growth rate may soon exceed capacity
description: >
Pool '{{ $labels.name }}' will be full in less than 5 days
assuming the average fill-up rate of the past 48 hours.
- alert: CephPoolBackfillFull
expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
labels:
severity: warning
type: ceph_default
annotations:
summary: Free space in a pool is too low for recovery/backfill
description: >
A pool is approaching the near full threshold, which will
prevent recovery/backfill from completing.
Consider adding more capacity.
- alert: CephPoolFull
expr: ceph_health_detail{name="POOL_FULL"} > 0
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.9.1
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
summary: Pool is full - writes are blocked
description: |
A pool has reached its MAX quota, or OSDs supporting the pool
have reached the FULL threshold. Until this is resolved, writes to
the pool will be blocked.
Pool Breakdown (top 5)
{{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }}
- {{ .Labels.name }} at {{ .Value }}%
{{- end }}
Increase the pool's quota, or add capacity to the cluster
then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
- alert: CephPoolNearFull
expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
for: 5m
labels:
severity: warning
type: ceph_default
annotations:
summary: One or more Ceph pools are nearly full
description: |
A pool has exceeded the warning (percent full) threshold, or OSDs
supporting the pool have reached the NEARFULL threshold. Writes may
continue, but you are at risk of the pool going read-only if more capacity
isn't made available.
Determine the affected pool with 'ceph df detail', looking
at QUOTA BYTES and STORED. Increase the pool's quota, or add
capacity to the cluster then increase the pool's quota
(e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>).
Also ensure that the balancer is active.
- name: healthchecks
rules:
- alert: CephSlowOps
expr: ceph_healthcheck_slow_ops > 0
for: 30s
labels:
severity: warning
type: ceph_default
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
summary: OSD operations are slow to complete
description: >
{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
# Object related events
- name: rados
rules:
- alert: CephObjectMissing
expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
for: 30s
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.10.1
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
summary: Object(s) marked UNFOUND
description: |
        The latest version of a RADOS object cannot be found, even though all OSDs are up. I/O
        requests for this object from clients will block (hang). Resolving this issue may
        require the object to be rolled back to a prior version and verified manually.
# Generic
- name: generic
rules:
- alert: CephDaemonCrash
expr: ceph_health_detail{name="RECENT_CRASH"} == 1
for: 1m
labels:
severity: critical
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.1.2
annotations:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
summary: One or more Ceph daemons have crashed, and are pending acknowledgement
description: |
One or more daemons have crashed recently, and need to be acknowledged. This notification
ensures that software crashes do not go unseen. To acknowledge a crash, use the
'ceph crash archive <id>' command.
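# A minimal sketch (not part of the upstream rule set) of how a custom group
# could be appended here, assuming the ceph_health_status gauge (0=OK,
# 1=WARN, 2=ERR) exported by the mgr Prometheus module; the alert name and
# threshold below are illustrative:
#  - name: custom
#    rules:
#      - alert: CephHealthWarning
#        expr: ceph_health_status == 1
#        for: 15m
#        labels:
#          severity: warning
#          type: ceph_default
#        annotations:
#          summary: Ceph cluster has been in HEALTH_WARN for 15 minutes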
View File

@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: rook-prometheus
namespace: rook-ceph
spec:
type: NodePort
ports:
- name: web
nodePort: 30900
port: 9090
protocol: TCP
targetPort: web
selector:
prometheus: rook-prometheus
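# With this NodePort in place, the Prometheus UI served by the
# rook-prometheus pods should be reachable on port 30900 of any node,
# e.g. (the node address is a placeholder for illustration):
#   curl http://<node-ip>:30900/graph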
View File

@ -0,0 +1,69 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: rook-ceph
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
aggregationRule:
clusterRoleSelectors:
- matchLabels:
rbac.ceph.rook.io/aggregate-to-prometheus: "true"
rules: []
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus-rules
labels:
rbac.ceph.rook.io/aggregate-to-prometheus: "true"
rules:
- apiGroups: [""]
resources:
- nodes
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: rook-ceph
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: rook-prometheus
namespace: rook-ceph
labels:
prometheus: rook-prometheus
spec:
serviceAccountName: prometheus
serviceMonitorSelector:
matchLabels:
team: rook
ruleSelector:
matchLabels:
role: alert-rules
prometheus: rook-prometheus
resources:
requests:
memory: 400Mi
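# Note: given the ruleSelector above, this instance only loads rules from
# PrometheusRule objects carrying the matching labels; such a manifest would
# start roughly like this (sketch; the object name is an assumption):
#   apiVersion: monitoring.coreos.com/v1
#   kind: PrometheusRule
#   metadata:
#     name: prometheus-ceph-rules
#     namespace: rook-ceph
#     labels:
#       role: alert-rules
#       prometheus: rook-prometheus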
View File

@ -0,0 +1,113 @@
---
# OLM: BEGIN ROLE
# Aspects for creation of monitoring resources
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-monitor
namespace: rook-ceph
rules:
- apiGroups:
- monitoring.coreos.com
resources:
- servicemonitors
verbs:
- get
- list
- watch
- create
- update
- delete
# OLM: END ROLE
---
# OLM: BEGIN ROLE BINDING
# Allow creation of monitoring resources
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-monitor
namespace: rook-ceph
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: rook-ceph-monitor
subjects:
- kind: ServiceAccount
name: rook-ceph-system
namespace: rook-ceph
# OLM: END ROLE BINDING
---
# OLM: BEGIN ROLE
# Aspects for metrics collection
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-metrics
namespace: rook-ceph
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
# OLM: END ROLE
---
# OLM: BEGIN ROLE BINDING
# Allow collection of metrics
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-metrics
namespace: rook-ceph
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: rook-ceph-metrics
subjects:
- kind: ServiceAccount
# change to the serviceaccount and namespace to use for monitoring
name: prometheus-k8s
namespace: rook-ceph
# OLM: END ROLE BINDING
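# Note: the subject above keeps the upstream default 'prometheus-k8s'; for
# the rook-prometheus instance defined earlier (ServiceAccount 'prometheus'
# in namespace 'rook-ceph'), the subject would instead read:
#   - kind: ServiceAccount
#     name: prometheus
#     namespace: rook-ceph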
---
# OLM: BEGIN ROLE
# Allow management of monitoring resources in the mgr
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-monitor-mgr
namespace: rook-ceph
rules:
- apiGroups:
- monitoring.coreos.com
resources:
- servicemonitors
verbs:
- get
- list
- create
- update
# OLM: END ROLE
---
# OLM: BEGIN ROLE BINDING
# Allow creation of monitoring resources in the mgr
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: rook-ceph-monitor-mgr
namespace: rook-ceph
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: rook-ceph-monitor-mgr
subjects:
- kind: ServiceAccount
name: rook-ceph-mgr
namespace: rook-ceph
# OLM: END ROLE BINDING
---
View File

@ -0,0 +1,20 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: rook-ceph-mgr
namespace: rook-ceph
labels:
team: rook
spec:
namespaceSelector:
matchNames:
- rook-ceph
selector:
matchLabels:
app: rook-ceph-mgr
rook_cluster: rook-ceph
ceph_daemon_id: a
endpoints:
- port: http-metrics
path: /metrics
interval: 5s
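# Note: the ceph_daemon_id selector pins scraping to the mgr daemon 'a';
# clusters running multiple mgr daemons would need this relaxed.
# To confirm the monitor is picked up after applying the manifest
# (illustrative; requires the Prometheus Operator CRDs):
#   kubectl -n rook-ceph get servicemonitor rook-ceph-mgr -o yaml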
View File

@ -0,0 +1,694 @@
#################################################################################################################
# The deployment for the rook operator
# Contains the common settings for most Kubernetes deployments.
# For example, to create the rook-ceph cluster:
# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
# kubectl create -f cluster.yaml
#
# Also see other operator sample files for variations of operator.yaml:
# - operator-openshift.yaml: Common settings for running in OpenShift
###############################################################################################################
# Rook Ceph Operator Config ConfigMap
# Use this ConfigMap to override Rook-Ceph Operator configurations.
# NOTE! Precedence will be given to this config if the same Env Var config also exists in the
# Operator Deployment.
# To move a configuration from the Operator Deployment to this ConfigMap, add the config
# here. It is recommended to then remove it from the Deployment to eliminate any future confusion.
kind: ConfigMap
apiVersion: v1
metadata:
name: rook-ceph-operator-config
# should be in the namespace of the operator
namespace: rook-ceph # namespace:operator
data:
# The logging level for the operator: ERROR | WARNING | INFO | DEBUG
ROOK_LOG_LEVEL: "INFO"
# Allow using loop devices for osds in test clusters.
ROOK_CEPH_ALLOW_LOOP_DEVICES: "false"
# Enable the CSI driver.
# To run the non-default version of the CSI driver, see the override-able image properties in operator.yaml
ROOK_CSI_ENABLE_CEPHFS: "true"
# Enable the default version of the CSI RBD driver. To start another version of the CSI driver, see image properties below.
ROOK_CSI_ENABLE_RBD: "true"
# Enable the CSI NFS driver. To start another version of the CSI driver, see image properties below.
ROOK_CSI_ENABLE_NFS: "false"
ROOK_CSI_ENABLE_GRPC_METRICS: "false"
# Set to true to enable Ceph CSI pvc encryption support.
CSI_ENABLE_ENCRYPTION: "false"
# Set to true to enable host networking for CSI CephFS and RBD nodeplugins. This may be necessary
# in some network configurations where the SDN does not provide access to an external cluster or
# there is significant drop in read/write performance.
# CSI_ENABLE_HOST_NETWORK: "true"
# Set to true to enable adding volume metadata on the CephFS subvolume and RBD images.
# Not all users might be interested in getting volume/snapshot details as metadata on CephFS subvolume and RBD images.
# Hence enable metadata is false by default.
# CSI_ENABLE_METADATA: "true"
  # Cluster name identifier to set as metadata on the CephFS subvolume and RBD images. This is useful when,
  # for example, two container orchestrator clusters (Kubernetes/OCP) are using a single Ceph cluster.
# CSI_CLUSTER_NAME: "my-prod-cluster"
# Set logging level for cephCSI containers maintained by the cephCSI.
# Supported values from 0 to 5. 0 for general useful logs, 5 for trace level verbosity.
# CSI_LOG_LEVEL: "0"
# Set logging level for Kubernetes-csi sidecar containers.
# Supported values from 0 to 5. 0 for general useful logs (the default), 5 for trace level verbosity.
# CSI_SIDECAR_LOG_LEVEL: "0"
# Set replicas for csi provisioner deployment.
CSI_PROVISIONER_REPLICAS: "2"
# OMAP generator will generate the omap mapping between the PV name and the RBD image.
# CSI_ENABLE_OMAP_GENERATOR need to be enabled when we are using rbd mirroring feature.
# By default OMAP generator sidecar is deployed with CSI provisioner pod, to disable
# it set it to false.
# CSI_ENABLE_OMAP_GENERATOR: "false"
# set to false to disable deployment of snapshotter container in CephFS provisioner pod.
CSI_ENABLE_CEPHFS_SNAPSHOTTER: "true"
# set to false to disable deployment of snapshotter container in NFS provisioner pod.
CSI_ENABLE_NFS_SNAPSHOTTER: "true"
# set to false to disable deployment of snapshotter container in RBD provisioner pod.
CSI_ENABLE_RBD_SNAPSHOTTER: "true"
# Enable cephfs kernel driver instead of ceph-fuse.
# If you disable the kernel client, your application may be disrupted during upgrade.
# See the upgrade guide: https://rook.io/docs/rook/latest/ceph-upgrade.html
# NOTE! cephfs quota is not supported in kernel version < 4.17
CSI_FORCE_CEPHFS_KERNEL_CLIENT: "true"
# (Optional) policy for modifying a volume's ownership or permissions when the RBD PVC is being mounted.
# supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
CSI_RBD_FSGROUPPOLICY: "File"
# (Optional) policy for modifying a volume's ownership or permissions when the CephFS PVC is being mounted.
# supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
CSI_CEPHFS_FSGROUPPOLICY: "File"
# (Optional) policy for modifying a volume's ownership or permissions when the NFS PVC is being mounted.
# supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
CSI_NFS_FSGROUPPOLICY: "File"
# (Optional) Allow starting unsupported ceph-csi image
ROOK_CSI_ALLOW_UNSUPPORTED_VERSION: "false"
# (Optional) control the host mount of /etc/selinux for csi plugin pods.
CSI_PLUGIN_ENABLE_SELINUX_HOST_MOUNT: "false"
# The default version of CSI supported by Rook will be started. To change the version
# of the CSI driver to something other than what is officially supported, change
# these images to the desired release of the CSI driver.
# ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.8.0"
# ROOK_CSI_REGISTRAR_IMAGE: "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.7.0"
# ROOK_CSI_RESIZER_IMAGE: "registry.k8s.io/sig-storage/csi-resizer:v1.7.0"
# ROOK_CSI_PROVISIONER_IMAGE: "registry.k8s.io/sig-storage/csi-provisioner:v3.4.0"
# ROOK_CSI_SNAPSHOTTER_IMAGE: "registry.k8s.io/sig-storage/csi-snapshotter:v6.2.1"
# ROOK_CSI_ATTACHER_IMAGE: "registry.k8s.io/sig-storage/csi-attacher:v4.1.0"
# To indicate the image pull policy to be applied to all the containers in the csi driver pods.
# ROOK_CSI_IMAGE_PULL_POLICY: "IfNotPresent"
# (Optional) set user created priorityclassName for csi plugin pods.
CSI_PLUGIN_PRIORITY_CLASSNAME: "system-node-critical"
# (Optional) set user created priorityclassName for csi provisioner pods.
CSI_PROVISIONER_PRIORITY_CLASSNAME: "system-cluster-critical"
# CSI CephFS plugin daemonset update strategy, supported values are OnDelete and RollingUpdate.
# Default value is RollingUpdate.
# CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY: "OnDelete"
# CSI RBD plugin daemonset update strategy, supported values are OnDelete and RollingUpdate.
# Default value is RollingUpdate.
# CSI_RBD_PLUGIN_UPDATE_STRATEGY: "OnDelete"
# A maxUnavailable parameter of CSI RBD plugin daemonset update strategy.
# Default value is 1.
# CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE: "1"
# CSI NFS plugin daemonset update strategy, supported values are OnDelete and RollingUpdate.
# Default value is RollingUpdate.
# CSI_NFS_PLUGIN_UPDATE_STRATEGY: "OnDelete"
# kubelet directory path, if kubelet configured to use other than /var/lib/kubelet path.
# ROOK_CSI_KUBELET_DIR_PATH: "/var/lib/kubelet"
# Labels to add to the CSI CephFS Deployments and DaemonSets Pods.
# ROOK_CSI_CEPHFS_POD_LABELS: "key1=value1,key2=value2"
# Labels to add to the CSI RBD Deployments and DaemonSets Pods.
# ROOK_CSI_RBD_POD_LABELS: "key1=value1,key2=value2"
# Labels to add to the CSI NFS Deployments and DaemonSets Pods.
# ROOK_CSI_NFS_POD_LABELS: "key1=value1,key2=value2"
# (Optional) CephCSI CephFS plugin Volumes
# CSI_CEPHFS_PLUGIN_VOLUME: |
# - name: lib-modules
# hostPath:
# path: /run/current-system/kernel-modules/lib/modules/
# - name: host-nix
# hostPath:
# path: /nix
# (Optional) CephCSI CephFS plugin Volume mounts
# CSI_CEPHFS_PLUGIN_VOLUME_MOUNT: |
# - name: host-nix
# mountPath: /nix
# readOnly: true
# (Optional) CephCSI RBD plugin Volumes
# CSI_RBD_PLUGIN_VOLUME: |
# - name: lib-modules
# hostPath:
# path: /run/current-system/kernel-modules/lib/modules/
# - name: host-nix
# hostPath:
# path: /nix
# (Optional) CephCSI RBD plugin Volume mounts
# CSI_RBD_PLUGIN_VOLUME_MOUNT: |
# - name: host-nix
# mountPath: /nix
# readOnly: true
# (Optional) CephCSI provisioner NodeAffinity (applied to both CephFS and RBD provisioner).
# CSI_PROVISIONER_NODE_AFFINITY: "role=storage-node; storage=rook, ceph"
# (Optional) CephCSI provisioner tolerations list(applied to both CephFS and RBD provisioner).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_PROVISIONER_TOLERATIONS: |
# - effect: NoSchedule
# key: node-role.kubernetes.io/control-plane
# operator: Exists
# - effect: NoExecute
# key: node-role.kubernetes.io/etcd
# operator: Exists
# (Optional) CephCSI plugin NodeAffinity (applied to both CephFS and RBD plugin).
# CSI_PLUGIN_NODE_AFFINITY: "role=storage-node; storage=rook, ceph"
# (Optional) CephCSI plugin tolerations list(applied to both CephFS and RBD plugin).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_PLUGIN_TOLERATIONS: |
# - effect: NoSchedule
# key: node-role.kubernetes.io/control-plane
# operator: Exists
# - effect: NoExecute
# key: node-role.kubernetes.io/etcd
# operator: Exists
# (Optional) CephCSI RBD provisioner NodeAffinity (if specified, overrides CSI_PROVISIONER_NODE_AFFINITY).
# CSI_RBD_PROVISIONER_NODE_AFFINITY: "role=rbd-node"
# (Optional) CephCSI RBD provisioner tolerations list(if specified, overrides CSI_PROVISIONER_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_RBD_PROVISIONER_TOLERATIONS: |
# - key: node.rook.io/rbd
# operator: Exists
# (Optional) CephCSI RBD plugin NodeAffinity (if specified, overrides CSI_PLUGIN_NODE_AFFINITY).
# CSI_RBD_PLUGIN_NODE_AFFINITY: "role=rbd-node"
# (Optional) CephCSI RBD plugin tolerations list(if specified, overrides CSI_PLUGIN_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_RBD_PLUGIN_TOLERATIONS: |
# - key: node.rook.io/rbd
# operator: Exists
# (Optional) CephCSI CephFS provisioner NodeAffinity (if specified, overrides CSI_PROVISIONER_NODE_AFFINITY).
# CSI_CEPHFS_PROVISIONER_NODE_AFFINITY: "role=cephfs-node"
# (Optional) CephCSI CephFS provisioner tolerations list(if specified, overrides CSI_PROVISIONER_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_CEPHFS_PROVISIONER_TOLERATIONS: |
# - key: node.rook.io/cephfs
# operator: Exists
# (Optional) CephCSI CephFS plugin NodeAffinity (if specified, overrides CSI_PLUGIN_NODE_AFFINITY).
# CSI_CEPHFS_PLUGIN_NODE_AFFINITY: "role=cephfs-node"
# NOTE: Support for defining NodeAffinity for operators other than "In" and "Exists" requires the user to input a
# valid v1.NodeAffinity JSON or YAML string. For example, the following is valid YAML v1.NodeAffinity:
# CSI_CEPHFS_PLUGIN_NODE_AFFINITY: |
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: myKey
# operator: DoesNotExist
# (Optional) CephCSI CephFS plugin tolerations list(if specified, overrides CSI_PLUGIN_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_CEPHFS_PLUGIN_TOLERATIONS: |
# - key: node.rook.io/cephfs
# operator: Exists
# (Optional) CephCSI NFS provisioner NodeAffinity (overrides CSI_PROVISIONER_NODE_AFFINITY).
# CSI_NFS_PROVISIONER_NODE_AFFINITY: "role=nfs-node"
# (Optional) CephCSI NFS provisioner tolerations list (overrides CSI_PROVISIONER_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_NFS_PROVISIONER_TOLERATIONS: |
# - key: node.rook.io/nfs
# operator: Exists
# (Optional) CephCSI NFS plugin NodeAffinity (overrides CSI_PLUGIN_NODE_AFFINITY).
# CSI_NFS_PLUGIN_NODE_AFFINITY: "role=nfs-node"
# (Optional) CephCSI NFS plugin tolerations list (overrides CSI_PLUGIN_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_NFS_PLUGIN_TOLERATIONS: |
# - key: node.rook.io/nfs
# operator: Exists
# (Optional) CEPH CSI RBD provisioner resource requirement list, Put here list of resource
# requests and limits you want to apply for provisioner pod
#CSI_RBD_PROVISIONER_RESOURCE: |
# - name : csi-provisioner
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-resizer
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-attacher
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-snapshotter
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-rbdplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : csi-omap-generator
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI RBD plugin resource requirement list, Put here list of resource
# requests and limits you want to apply for plugin pod
#CSI_RBD_PLUGIN_RESOURCE: |
# - name : driver-registrar
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# - name : csi-rbdplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI CephFS provisioner resource requirement list, Put here list of resource
# requests and limits you want to apply for provisioner pod
#CSI_CEPHFS_PROVISIONER_RESOURCE: |
# - name : csi-provisioner
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-resizer
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-attacher
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-snapshotter
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-cephfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI CephFS plugin resource requirement list, Put here list of resource
# requests and limits you want to apply for plugin pod
#CSI_CEPHFS_PLUGIN_RESOURCE: |
# - name : driver-registrar
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# - name : csi-cephfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI NFS provisioner resource requirement list, Put here list of resource
# requests and limits you want to apply for provisioner pod
# CSI_NFS_PROVISIONER_RESOURCE: |
# - name : csi-provisioner
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-nfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : csi-attacher
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# (Optional) CEPH CSI NFS plugin resource requirement list, Put here list of resource
# requests and limits you want to apply for plugin pod
# CSI_NFS_PLUGIN_RESOURCE: |
# - name : driver-registrar
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# - name : csi-nfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# Configure CSI Ceph FS grpc and liveness metrics port
# Set to true to enable Ceph CSI liveness container.
CSI_ENABLE_LIVENESS: "false"
# CSI_CEPHFS_GRPC_METRICS_PORT: "9091"
# CSI_CEPHFS_LIVENESS_METRICS_PORT: "9081"
# Configure CSI RBD grpc and liveness metrics port
# CSI_RBD_GRPC_METRICS_PORT: "9090"
# CSI_RBD_LIVENESS_METRICS_PORT: "9080"
# CSIADDONS_PORT: "9070"
# Set CephFS Kernel mount options to use https://docs.ceph.com/en/latest/man/8/mount.ceph/#options
# Set to "ms_mode=secure" when connections.encrypted is enabled in CephCluster CR
# CSI_CEPHFS_KERNEL_MOUNT_OPTIONS: "ms_mode=secure"
# Whether the OBC provisioner should watch on the operator namespace or not, if not the namespace of the cluster will be used
ROOK_OBC_WATCH_OPERATOR_NAMESPACE: "true"
# Whether to start the discovery daemon to watch for raw storage devices on nodes in the cluster.
# This daemon does not need to run if you are only going to create your OSDs based on StorageClassDeviceSets with PVCs.
ROOK_ENABLE_DISCOVERY_DAEMON: "false"
  # The timeout value (in seconds) of Ceph commands. It should be >= 1. If this variable is not set or is an invalid value, it defaults to 15.
  ROOK_CEPH_COMMANDS_TIMEOUT_SECONDS: "15"
# Enable the csi addons sidecar.
CSI_ENABLE_CSIADDONS: "false"
# ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.5.0"
  # The CSI GRPC timeout value (in seconds). It should be >= 120. If this variable is not set or is an invalid value, it defaults to 150.
  CSI_GRPC_TIMEOUT_SECONDS: "150"
ROOK_DISABLE_ADMISSION_CONTROLLER: "true"
# Enable topology based provisioning.
CSI_ENABLE_TOPOLOGY: "false"
# Domain labels define which node labels to use as domains
# for CSI nodeplugins to advertise their domains
# NOTE: the value here serves as an example and needs to be
# updated with node labels that define domains of interest
# CSI_TOPOLOGY_DOMAIN_LABELS: "kubernetes.io/hostname,topology.kubernetes.io/zone,topology.rook.io/rack"
# Enable read affinity for RBD volumes. Recommended to
# set to true if running kernel 5.8 or newer.
CSI_ENABLE_READ_AFFINITY: "false"
# CRUSH location labels define which node labels to use
# as CRUSH location. This should correspond to the values set in
# the CRUSH map.
# Defaults to all the labels mentioned in
# https://rook.io/docs/rook/latest/CRDs/Cluster/ceph-cluster-crd/#osd-topology
# CSI_CRUSH_LOCATION_LABELS: "kubernetes.io/hostname,topology.kubernetes.io/zone,topology.rook.io/rack"
# Whether to skip any attach operation altogether for CephCSI PVCs.
# See more details [here](https://kubernetes-csi.github.io/docs/skip-attach.html#skip-attach-with-csi-driver-object).
  # If set to false, volume attachments are skipped, which makes the creation of pods using a CephCSI PVC faster.
  # **WARNING** Skipping attach is highly discouraged for RWO volumes: for RBD PVCs it can cause data corruption.
  # csi-addons operations like Reclaimspace and PVC Keyrotation will also not be supported if set to false,
# since we'll have no VolumeAttachments to determine which node the PVC is mounted on.
# Refer to this [issue](https://github.com/kubernetes/kubernetes/issues/103305) for more details.
CSI_CEPHFS_ATTACH_REQUIRED: "true"
CSI_RBD_ATTACH_REQUIRED: "true"
CSI_NFS_ATTACH_REQUIRED: "true"
---
# OLM: BEGIN OPERATOR DEPLOYMENT
apiVersion: apps/v1
kind: Deployment
metadata:
name: rook-ceph-operator
namespace: rook-ceph # namespace:operator
labels:
operator: rook
storage-backend: ceph
app.kubernetes.io/name: rook-ceph
app.kubernetes.io/instance: rook-ceph
app.kubernetes.io/component: rook-ceph-operator
app.kubernetes.io/part-of: rook-ceph-operator
spec:
selector:
matchLabels:
app: rook-ceph-operator
strategy:
type: Recreate
replicas: 1
template:
metadata:
labels:
app: rook-ceph-operator
spec:
serviceAccountName: rook-ceph-system
containers:
- name: rook-ceph-operator
image: rook/ceph:master
args: ["ceph", "operator"]
securityContext:
runAsNonRoot: true
runAsUser: 2016
runAsGroup: 2016
volumeMounts:
- mountPath: /var/lib/rook
name: rook-config
- mountPath: /etc/ceph
name: default-config-dir
- mountPath: /etc/webhook
name: webhook-cert
ports:
- containerPort: 9443
name: https-webhook
protocol: TCP
env:
# If the operator should only watch for cluster CRDs in the same namespace, set this to "true".
# If this is not set to true, the operator will watch for cluster CRDs in all namespaces.
- name: ROOK_CURRENT_NAMESPACE_ONLY
value: "false"
# Rook Discover toleration. Will tolerate all taints with all keys.
# Choose between NoSchedule, PreferNoSchedule and NoExecute:
# - name: DISCOVER_TOLERATION
# value: "NoSchedule"
# (Optional) Rook Discover toleration key. Set this to the key of the taint you want to tolerate
# - name: DISCOVER_TOLERATION_KEY
# value: "<KeyOfTheTaintToTolerate>"
# (Optional) Rook Discover tolerations list. Put here list of taints you want to tolerate in YAML format.
# - name: DISCOVER_TOLERATIONS
# value: |
# - effect: NoSchedule
# key: node-role.kubernetes.io/control-plane
# operator: Exists
# - effect: NoExecute
# key: node-role.kubernetes.io/etcd
# operator: Exists
# (Optional) Rook Discover priority class name to set on the pod(s)
# - name: DISCOVER_PRIORITY_CLASS_NAME
# value: "<PriorityClassName>"
# (Optional) Discover Agent NodeAffinity.
# - name: DISCOVER_AGENT_NODE_AFFINITY
# value: "role=storage-node; storage=rook, ceph"
# (Optional) Discover Agent Pod Labels.
# - name: DISCOVER_AGENT_POD_LABELS
# value: "key1=value1,key2=value2"
# The duration between discovering devices in the rook-discover daemonset.
- name: ROOK_DISCOVER_DEVICES_INTERVAL
value: "60m"
# Whether to start pods as privileged that mount a host path, which includes the Ceph mon and osd pods.
# Set this to true if SELinux is enabled (e.g. OpenShift) to workaround the anyuid issues.
# For more details see https://github.com/rook/rook/issues/1314#issuecomment-355799641
- name: ROOK_HOSTPATH_REQUIRES_PRIVILEGED
value: "false"
# Disable automatic orchestration when new devices are discovered
- name: ROOK_DISABLE_DEVICE_HOTPLUG
value: "false"
          # Provide customised regexes as comma-separated values, e.g. the regex for an rbd-based volume would be "(?i)rbd[0-9]+".
          # To match more than one pattern, separate the regexes with commas.
          # The default regex is "(?i)dm-[0-9]+,(?i)rbd[0-9]+,(?i)nbd[0-9]+".
          # Append a regex after a comma to blacklist an additional disk.
          # If the value is empty, the default regex is used.
- name: DISCOVER_DAEMON_UDEV_BLACKLIST
value: "(?i)dm-[0-9]+,(?i)rbd[0-9]+,(?i)nbd[0-9]+"
# - name: DISCOVER_DAEMON_RESOURCES
# value: |
# resources:
# limits:
# cpu: 500m
# memory: 512Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Time to wait until the node controller will move Rook pods to other
# nodes after detecting an unreachable node.
# Pods affected by this setting are:
# mgr, rbd, mds, rgw, nfs, PVC based mons and osds, and ceph toolbox
# The value used in this variable replaces the default value of 300 secs
# added automatically by k8s as Toleration for
# <node.kubernetes.io/unreachable>
# The total amount of time to reschedule Rook pods in healthy nodes
# before detecting a <not ready node> condition will be the sum of:
# --> node-monitor-grace-period: 40 seconds (k8s kube-controller-manager flag)
# --> ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS: 5 seconds
- name: ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS
value: "5"
# The name of the node to pass with the downward API
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
# The pod name to pass with the downward API
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
# The pod namespace to pass with the downward API
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
# Recommended resource requests and limits, if desired
#resources:
# limits:
# cpu: 500m
# memory: 512Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Uncomment it to run lib bucket provisioner in multithreaded mode
#- name: LIB_BUCKET_PROVISIONER_THREADS
# value: "5"
# Uncomment it to run rook operator on the host network
#hostNetwork: true
volumes:
- name: rook-config
emptyDir: {}
- name: default-config-dir
emptyDir: {}
- name: webhook-cert
emptyDir: {}
# OLM: END OPERATOR DEPLOYMENT
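# A quick way to verify the operator after applying this file (illustrative):
#   kubectl -n rook-ceph get pods -l app=rook-ceph-operator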
View File

@ -0,0 +1,690 @@
#################################################################################################################
# The deployment for the rook operator
# Contains the common settings for most Kubernetes deployments.
# For example, to create the rook-ceph cluster:
# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
# kubectl create -f cluster.yaml
#
# Also see other operator sample files for variations of operator.yaml:
# - operator-openshift.yaml: Common settings for running in OpenShift
###############################################################################################################
# Rook Ceph Operator Config ConfigMap
# Use this ConfigMap to override Rook-Ceph Operator configurations.
# NOTE! Precedence will be given to this config if the same Env Var config also exists in the
# Operator Deployment.
# To move a configuration from the Operator Deployment to this ConfigMap, add the config
# here. It is recommended to then remove it from the Deployment to eliminate any future confusion.
kind: ConfigMap
apiVersion: v1
metadata:
name: rook-ceph-operator-config
# should be in the namespace of the operator
namespace: rook-ceph # namespace:operator
data:
# The logging level for the operator: ERROR | WARNING | INFO | DEBUG
ROOK_LOG_LEVEL: "INFO"
# Allow using loop devices for osds in test clusters.
ROOK_CEPH_ALLOW_LOOP_DEVICES: "false"
# Enable the CSI driver.
# To run the non-default version of the CSI driver, see the override-able image properties in operator.yaml
ROOK_CSI_ENABLE_CEPHFS: "true"
# Enable the default version of the CSI RBD driver. To start another version of the CSI driver, see image properties below.
ROOK_CSI_ENABLE_RBD: "true"
# Enable the CSI NFS driver. To start another version of the CSI driver, see image properties below.
ROOK_CSI_ENABLE_NFS: "false"
ROOK_CSI_ENABLE_GRPC_METRICS: "false"
# Set to true to enable Ceph CSI pvc encryption support.
CSI_ENABLE_ENCRYPTION: "false"
# Set to true to enable host networking for CSI CephFS and RBD nodeplugins. This may be necessary
# in some network configurations where the SDN does not provide access to an external cluster or
# there is significant drop in read/write performance.
# CSI_ENABLE_HOST_NETWORK: "true"
# Set to true to enable adding volume metadata on the CephFS subvolume and RBD images.
# Not all users might be interested in getting volume/snapshot details as metadata on CephFS subvolume and RBD images.
# Hence enable metadata is false by default.
# CSI_ENABLE_METADATA: "true"
  # Cluster name identifier to set as metadata on the CephFS subvolume and RBD images. This is useful when,
  # for example, two container orchestrator clusters (Kubernetes/OCP) are using a single Ceph cluster.
# CSI_CLUSTER_NAME: "my-prod-cluster"
# Set logging level for cephCSI containers maintained by the cephCSI.
# Supported values from 0 to 5. 0 for general useful logs, 5 for trace level verbosity.
# CSI_LOG_LEVEL: "0"
# Set logging level for Kubernetes-csi sidecar containers.
# Supported values from 0 to 5. 0 for general useful logs (the default), 5 for trace level verbosity.
# CSI_SIDECAR_LOG_LEVEL: "0"
# Set replicas for csi provisioner deployment.
CSI_PROVISIONER_REPLICAS: "1"
# OMAP generator will generate the omap mapping between the PV name and the RBD image.
# CSI_ENABLE_OMAP_GENERATOR need to be enabled when we are using rbd mirroring feature.
# By default OMAP generator sidecar is deployed with CSI provisioner pod, to disable
# it set it to false.
# CSI_ENABLE_OMAP_GENERATOR: "false"
# set to false to disable deployment of snapshotter container in CephFS provisioner pod.
CSI_ENABLE_CEPHFS_SNAPSHOTTER: "true"
# set to false to disable deployment of snapshotter container in NFS provisioner pod.
CSI_ENABLE_NFS_SNAPSHOTTER: "true"
# set to false to disable deployment of snapshotter container in RBD provisioner pod.
CSI_ENABLE_RBD_SNAPSHOTTER: "true"
# Enable cephfs kernel driver instead of ceph-fuse.
# If you disable the kernel client, your application may be disrupted during upgrade.
# See the upgrade guide: https://rook.io/docs/rook/latest/ceph-upgrade.html
# NOTE! cephfs quota is not supported in kernel version < 4.17
CSI_FORCE_CEPHFS_KERNEL_CLIENT: "true"
# (Optional) policy for modifying a volume's ownership or permissions when the RBD PVC is being mounted.
# supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
CSI_RBD_FSGROUPPOLICY: "File"
# (Optional) policy for modifying a volume's ownership or permissions when the CephFS PVC is being mounted.
# supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
CSI_CEPHFS_FSGROUPPOLICY: "File"
# (Optional) policy for modifying a volume's ownership or permissions when the NFS PVC is being mounted.
# supported values are documented at https://kubernetes-csi.github.io/docs/support-fsgroup.html
CSI_NFS_FSGROUPPOLICY: "File"
# (Optional) Allow starting unsupported ceph-csi image
ROOK_CSI_ALLOW_UNSUPPORTED_VERSION: "false"
# (Optional) control the host mount of /etc/selinux for csi plugin pods.
CSI_PLUGIN_ENABLE_SELINUX_HOST_MOUNT: "false"
# The default version of CSI supported by Rook will be started. To change the version
# of the CSI driver to something other than what is officially supported, change
# these images to the desired release of the CSI driver.
# ROOK_CSI_CEPH_IMAGE: "quay.io/cephcsi/cephcsi:v3.9.0"
# ROOK_CSI_REGISTRAR_IMAGE: "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.8.0"
# ROOK_CSI_RESIZER_IMAGE: "registry.k8s.io/sig-storage/csi-resizer:v1.8.0"
# ROOK_CSI_PROVISIONER_IMAGE: "registry.k8s.io/sig-storage/csi-provisioner:v3.5.0"
# ROOK_CSI_SNAPSHOTTER_IMAGE: "registry.k8s.io/sig-storage/csi-snapshotter:v6.2.2"
# ROOK_CSI_ATTACHER_IMAGE: "registry.k8s.io/sig-storage/csi-attacher:v4.3.0"
# To indicate the image pull policy to be applied to all the containers in the csi driver pods.
# ROOK_CSI_IMAGE_PULL_POLICY: "IfNotPresent"
# (Optional) set user created priorityclassName for csi plugin pods.
CSI_PLUGIN_PRIORITY_CLASSNAME: "system-node-critical"
# (Optional) set user created priorityclassName for csi provisioner pods.
CSI_PROVISIONER_PRIORITY_CLASSNAME: "system-cluster-critical"
# CSI CephFS plugin daemonset update strategy, supported values are OnDelete and RollingUpdate.
# Default value is RollingUpdate.
# CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY: "OnDelete"
# A maxUnavailable parameter of CSI cephFS plugin daemonset update strategy.
# Default value is 1.
# CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE: "1"
# CSI RBD plugin daemonset update strategy, supported values are OnDelete and RollingUpdate.
# Default value is RollingUpdate.
# CSI_RBD_PLUGIN_UPDATE_STRATEGY: "OnDelete"
# A maxUnavailable parameter of CSI RBD plugin daemonset update strategy.
# Default value is 1.
# CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE: "1"
# CSI NFS plugin daemonset update strategy, supported values are OnDelete and RollingUpdate.
# Default value is RollingUpdate.
# CSI_NFS_PLUGIN_UPDATE_STRATEGY: "OnDelete"
# kubelet directory path, if kubelet configured to use other than /var/lib/kubelet path.
# ROOK_CSI_KUBELET_DIR_PATH: "/var/lib/kubelet"
# Labels to add to the CSI CephFS Deployments and DaemonSets Pods.
# ROOK_CSI_CEPHFS_POD_LABELS: "key1=value1,key2=value2"
# Labels to add to the CSI RBD Deployments and DaemonSets Pods.
# ROOK_CSI_RBD_POD_LABELS: "key1=value1,key2=value2"
# Labels to add to the CSI NFS Deployments and DaemonSets Pods.
# ROOK_CSI_NFS_POD_LABELS: "key1=value1,key2=value2"
# (Optional) CephCSI CephFS plugin Volumes
# CSI_CEPHFS_PLUGIN_VOLUME: |
# - name: lib-modules
# hostPath:
# path: /run/current-system/kernel-modules/lib/modules/
# - name: host-nix
# hostPath:
# path: /nix
# (Optional) CephCSI CephFS plugin Volume mounts
# CSI_CEPHFS_PLUGIN_VOLUME_MOUNT: |
# - name: host-nix
# mountPath: /nix
# readOnly: true
# (Optional) CephCSI RBD plugin Volumes
# CSI_RBD_PLUGIN_VOLUME: |
# - name: lib-modules
# hostPath:
# path: /run/current-system/kernel-modules/lib/modules/
# - name: host-nix
# hostPath:
# path: /nix
# (Optional) CephCSI RBD plugin Volume mounts
# CSI_RBD_PLUGIN_VOLUME_MOUNT: |
# - name: host-nix
# mountPath: /nix
# readOnly: true
# (Optional) CephCSI provisioner NodeAffinity (applied to both CephFS and RBD provisioner).
# CSI_PROVISIONER_NODE_AFFINITY: "role=storage-node; storage=rook, ceph"
# (Optional) CephCSI provisioner tolerations list(applied to both CephFS and RBD provisioner).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_PROVISIONER_TOLERATIONS: |
# - effect: NoSchedule
# key: node-role.kubernetes.io/control-plane
# operator: Exists
# - effect: NoExecute
# key: node-role.kubernetes.io/etcd
# operator: Exists
# (Optional) CephCSI plugin NodeAffinity (applied to both CephFS and RBD plugin).
# CSI_PLUGIN_NODE_AFFINITY: "role=storage-node; storage=rook, ceph"
# (Optional) CephCSI plugin tolerations list(applied to both CephFS and RBD plugin).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_PLUGIN_TOLERATIONS: |
# - effect: NoSchedule
# key: node-role.kubernetes.io/control-plane
# operator: Exists
# - effect: NoExecute
# key: node-role.kubernetes.io/etcd
# operator: Exists
# (Optional) CephCSI RBD provisioner NodeAffinity (if specified, overrides CSI_PROVISIONER_NODE_AFFINITY).
# CSI_RBD_PROVISIONER_NODE_AFFINITY: "role=rbd-node"
# (Optional) CephCSI RBD provisioner tolerations list(if specified, overrides CSI_PROVISIONER_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_RBD_PROVISIONER_TOLERATIONS: |
# - key: node.rook.io/rbd
# operator: Exists
# (Optional) CephCSI RBD plugin NodeAffinity (if specified, overrides CSI_PLUGIN_NODE_AFFINITY).
# CSI_RBD_PLUGIN_NODE_AFFINITY: "role=rbd-node"
# (Optional) CephCSI RBD plugin tolerations list(if specified, overrides CSI_PLUGIN_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_RBD_PLUGIN_TOLERATIONS: |
# - key: node.rook.io/rbd
# operator: Exists
# (Optional) CephCSI CephFS provisioner NodeAffinity (if specified, overrides CSI_PROVISIONER_NODE_AFFINITY).
# CSI_CEPHFS_PROVISIONER_NODE_AFFINITY: "role=cephfs-node"
# (Optional) CephCSI CephFS provisioner tolerations list(if specified, overrides CSI_PROVISIONER_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_CEPHFS_PROVISIONER_TOLERATIONS: |
# - key: node.rook.io/cephfs
# operator: Exists
# (Optional) CephCSI CephFS plugin NodeAffinity (if specified, overrides CSI_PLUGIN_NODE_AFFINITY).
# CSI_CEPHFS_PLUGIN_NODE_AFFINITY: "role=cephfs-node"
# NOTE: Support for defining NodeAffinity for operators other than "In" and "Exists" requires the user to input a
# valid v1.NodeAffinity JSON or YAML string. For example, the following is valid YAML v1.NodeAffinity:
# CSI_CEPHFS_PLUGIN_NODE_AFFINITY: |
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: myKey
# operator: DoesNotExist
# (Optional) CephCSI CephFS plugin tolerations list(if specified, overrides CSI_PLUGIN_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_CEPHFS_PLUGIN_TOLERATIONS: |
# - key: node.rook.io/cephfs
# operator: Exists
# (Optional) CephCSI NFS provisioner NodeAffinity (overrides CSI_PROVISIONER_NODE_AFFINITY).
# CSI_NFS_PROVISIONER_NODE_AFFINITY: "role=nfs-node"
# (Optional) CephCSI NFS provisioner tolerations list (overrides CSI_PROVISIONER_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI provisioner would be best to start on the same nodes as other ceph daemons.
# CSI_NFS_PROVISIONER_TOLERATIONS: |
# - key: node.rook.io/nfs
# operator: Exists
# (Optional) CephCSI NFS plugin NodeAffinity (overrides CSI_PLUGIN_NODE_AFFINITY).
# CSI_NFS_PLUGIN_NODE_AFFINITY: "role=nfs-node"
# (Optional) CephCSI NFS plugin tolerations list (overrides CSI_PLUGIN_TOLERATIONS).
# Put here list of taints you want to tolerate in YAML format.
# CSI plugins need to be started on all the nodes where the clients need to mount the storage.
# CSI_NFS_PLUGIN_TOLERATIONS: |
# - key: node.rook.io/nfs
# operator: Exists
# (Optional) CEPH CSI RBD provisioner resource requirement list, Put here list of resource
# requests and limits you want to apply for provisioner pod
#CSI_RBD_PROVISIONER_RESOURCE: |
# - name : csi-provisioner
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-resizer
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-attacher
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-snapshotter
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-rbdplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : csi-omap-generator
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI RBD plugin resource requirement list, Put here list of resource
# requests and limits you want to apply for plugin pod
#CSI_RBD_PLUGIN_RESOURCE: |
# - name : driver-registrar
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# - name : csi-rbdplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI CephFS provisioner resource requirement list, Put here list of resource
# requests and limits you want to apply for provisioner pod
#CSI_CEPHFS_PROVISIONER_RESOURCE: |
# - name : csi-provisioner
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-resizer
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-attacher
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-snapshotter
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-cephfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI CephFS plugin resource requirement list, Put here list of resource
# requests and limits you want to apply for plugin pod
#CSI_CEPHFS_PLUGIN_RESOURCE: |
# - name : driver-registrar
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# - name : csi-cephfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : liveness-prometheus
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# (Optional) CEPH CSI NFS provisioner resource requirement list, Put here list of resource
# requests and limits you want to apply for provisioner pod
# CSI_NFS_PROVISIONER_RESOURCE: |
# - name : csi-provisioner
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# - name : csi-nfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# - name : csi-attacher
# resource:
# requests:
# memory: 128Mi
# cpu: 100m
# limits:
# memory: 256Mi
# cpu: 200m
# (Optional) CEPH CSI NFS plugin resource requirement list, Put here list of resource
# requests and limits you want to apply for plugin pod
# CSI_NFS_PLUGIN_RESOURCE: |
# - name : driver-registrar
# resource:
# requests:
# memory: 128Mi
# cpu: 50m
# limits:
# memory: 256Mi
# cpu: 100m
# - name : csi-nfsplugin
# resource:
# requests:
# memory: 512Mi
# cpu: 250m
# limits:
# memory: 1Gi
# cpu: 500m
# Configure CSI Ceph FS grpc and liveness metrics port
# Set to true to enable Ceph CSI liveness container.
CSI_ENABLE_LIVENESS: "false"
# CSI_CEPHFS_GRPC_METRICS_PORT: "9091"
# CSI_CEPHFS_LIVENESS_METRICS_PORT: "9081"
# Configure CSI RBD grpc and liveness metrics port
# CSI_RBD_GRPC_METRICS_PORT: "9090"
# CSI_RBD_LIVENESS_METRICS_PORT: "9080"
# CSIADDONS_PORT: "9070"
# Set CephFS Kernel mount options to use https://docs.ceph.com/en/latest/man/8/mount.ceph/#options
# Set to "ms_mode=secure" when connections.encrypted is enabled in CephCluster CR
# CSI_CEPHFS_KERNEL_MOUNT_OPTIONS: "ms_mode=secure"
# Whether the OBC provisioner should watch on the operator namespace or not, if not the namespace of the cluster will be used
ROOK_OBC_WATCH_OPERATOR_NAMESPACE: "true"
# Whether to start the discovery daemon to watch for raw storage devices on nodes in the cluster.
# This daemon does not need to run if you are only going to create your OSDs based on StorageClassDeviceSets with PVCs.
ROOK_ENABLE_DISCOVERY_DAEMON: "false"
  # The timeout value (in seconds) of Ceph commands. It should be >= 1. If this variable is not set or is an invalid value, it defaults to 15.
  ROOK_CEPH_COMMANDS_TIMEOUT_SECONDS: "15"
# Enable the csi addons sidecar.
CSI_ENABLE_CSIADDONS: "false"
# Enable watch for faster recovery from rbd rwo node loss
ROOK_WATCH_FOR_NODE_FAILURE: "true"
# ROOK_CSIADDONS_IMAGE: "quay.io/csiaddons/k8s-sidecar:v0.7.0"
  # The CSI GRPC timeout value (in seconds). It should be >= 120. If this variable is not set or is an invalid value, it defaults to 150.
  CSI_GRPC_TIMEOUT_SECONDS: "150"
ROOK_DISABLE_ADMISSION_CONTROLLER: "true"
# Enable topology based provisioning.
CSI_ENABLE_TOPOLOGY: "false"
# Domain labels define which node labels to use as domains
# for CSI nodeplugins to advertise their domains
# NOTE: the value here serves as an example and needs to be
# updated with node labels that define domains of interest
# CSI_TOPOLOGY_DOMAIN_LABELS: "kubernetes.io/hostname,topology.kubernetes.io/zone,topology.rook.io/rack"
# Enable read affinity for RBD volumes. Recommended to
# set to true if running kernel 5.8 or newer.
CSI_ENABLE_READ_AFFINITY: "false"
# CRUSH location labels define which node labels to use
# as CRUSH location. This should correspond to the values set in
# the CRUSH map.
# Defaults to all the labels mentioned in
# https://rook.io/docs/rook/latest/CRDs/Cluster/ceph-cluster-crd/#osd-topology
# CSI_CRUSH_LOCATION_LABELS: "kubernetes.io/hostname,topology.kubernetes.io/zone,topology.rook.io/rack"
# Whether to skip any attach operation altogether for CephCSI PVCs.
# See more details [here](https://kubernetes-csi.github.io/docs/skip-attach.html#skip-attach-with-csi-driver-object).
  # If set to false, volume attachments are skipped, which makes the creation of pods using a CephCSI PVC faster.
  # **WARNING** Skipping attach is highly discouraged for RWO volumes: for RBD PVCs it can cause data corruption.
  # csi-addons operations like Reclaimspace and PVC Keyrotation will also not be supported if set to false,
# since we'll have no VolumeAttachments to determine which node the PVC is mounted on.
# Refer to this [issue](https://github.com/kubernetes/kubernetes/issues/103305) for more details.
CSI_CEPHFS_ATTACH_REQUIRED: "true"
CSI_RBD_ATTACH_REQUIRED: "true"
CSI_NFS_ATTACH_REQUIRED: "true"
# Rook Discover toleration. Will tolerate all taints with all keys.
# (Optional) Rook Discover tolerations list. Put here list of taints you want to tolerate in YAML format.
# DISCOVER_TOLERATIONS: |
# - effect: NoSchedule
# key: node-role.kubernetes.io/control-plane
# operator: Exists
# - effect: NoExecute
# key: node-role.kubernetes.io/etcd
# operator: Exists
# (Optional) Rook Discover priority class name to set on the pod(s)
# DISCOVER_PRIORITY_CLASS_NAME: "<PriorityClassName>"
# (Optional) Discover Agent NodeAffinity.
# DISCOVER_AGENT_NODE_AFFINITY: |
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: myKey
# operator: DoesNotExist
# (Optional) Discover Agent Pod Labels.
# DISCOVER_AGENT_POD_LABELS: "key1=value1,key2=value2"
# Disable automatic orchestration when new devices are discovered
ROOK_DISABLE_DEVICE_HOTPLUG: "false"
# The duration between discovering devices in the rook-discover daemonset.
ROOK_DISCOVER_DEVICES_INTERVAL: "60m"
# DISCOVER_DAEMON_RESOURCES: |
# - name: DISCOVER_DAEMON_RESOURCES
# resources:
# limits:
# cpu: 500m
# memory: 512Mi
# requests:
# cpu: 100m
# memory: 128Mi
---
# OLM: BEGIN OPERATOR DEPLOYMENT
apiVersion: apps/v1
kind: Deployment
metadata:
name: rook-ceph-operator
namespace: rook-ceph # namespace:operator
labels:
operator: rook
storage-backend: ceph
app.kubernetes.io/name: rook-ceph
app.kubernetes.io/instance: rook-ceph
app.kubernetes.io/component: rook-ceph-operator
app.kubernetes.io/part-of: rook-ceph-operator
spec:
selector:
matchLabels:
app: rook-ceph-operator
strategy:
type: Recreate
replicas: 1
template:
metadata:
labels:
app: rook-ceph-operator
spec:
serviceAccountName: rook-ceph-system
containers:
- name: rook-ceph-operator
image: rook/ceph:master
args: ["ceph", "operator"]
securityContext:
runAsNonRoot: true
runAsUser: 2016
runAsGroup: 2016
capabilities:
drop: ["ALL"]
volumeMounts:
- mountPath: /var/lib/rook
name: rook-config
- mountPath: /etc/ceph
name: default-config-dir
- mountPath: /etc/webhook
name: webhook-cert
ports:
- containerPort: 9443
name: https-webhook
protocol: TCP
env:
# If the operator should only watch for cluster CRDs in the same namespace, set this to "true".
# If this is not set to true, the operator will watch for cluster CRDs in all namespaces.
- name: ROOK_CURRENT_NAMESPACE_ONLY
value: "false"
# Whether to start pods as privileged that mount a host path, which includes the Ceph mon and osd pods.
# Set this to true if SELinux is enabled (e.g. OpenShift) to workaround the anyuid issues.
# For more details see https://github.com/rook/rook/issues/1314#issuecomment-355799641
- name: ROOK_HOSTPATH_REQUIRES_PRIVILEGED
value: "false"
          # Provide customised regexes as comma-separated values, e.g. the regex for an rbd-based volume would be "(?i)rbd[0-9]+".
          # To match more than one pattern, separate the regexes with commas.
          # The default regex is "(?i)dm-[0-9]+,(?i)rbd[0-9]+,(?i)nbd[0-9]+".
          # Append a regex after a comma to blacklist an additional disk.
          # If the value is empty, the default regex is used.
- name: DISCOVER_DAEMON_UDEV_BLACKLIST
value: "(?i)dm-[0-9]+,(?i)rbd[0-9]+,(?i)nbd[0-9]+"
# Time to wait until the node controller will move Rook pods to other
# nodes after detecting an unreachable node.
# Pods affected by this setting are:
# mgr, rbd, mds, rgw, nfs, PVC based mons and osds, and ceph toolbox
# The value used in this variable replaces the default value of 300 secs
# added automatically by k8s as Toleration for
# <node.kubernetes.io/unreachable>
# The total amount of time to reschedule Rook pods in healthy nodes
# before detecting a <not ready node> condition will be the sum of:
# --> node-monitor-grace-period: 40 seconds (k8s kube-controller-manager flag)
# --> ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS: 5 seconds
- name: ROOK_UNREACHABLE_NODE_TOLERATION_SECONDS
value: "5"
# The name of the node to pass with the downward API
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
# The pod name to pass with the downward API
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
# The pod namespace to pass with the downward API
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
# Recommended resource requests and limits, if desired
#resources:
# limits:
# cpu: 500m
# memory: 512Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Uncomment it to run lib bucket provisioner in multithreaded mode
#- name: LIB_BUCKET_PROVISIONER_THREADS
# value: "5"
# Uncomment it to run rook operator on the host network
#hostNetwork: true
volumes:
- name: rook-config
emptyDir: {}
- name: default-config-dir
emptyDir: {}
- name: webhook-cert
emptyDir: {}
# OLM: END OPERATOR DEPLOYMENT
View File

@ -0,0 +1,2 @@
nameserver {{ ns1 }}
nameserver {{ ns2 }}
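# A minimal sketch of the Ansible task that could render this template;
# the template filename, destination path, and nameserver values below are
# assumptions for illustration:
# - name: Deploy resolv.conf
#   ansible.builtin.template:
#     src: resolv.conf.j2
#     dest: /etc/resolv.conf
#   vars:
#     ns1: 1.1.1.1
#     ns2: 8.8.8.8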
30
vars/default.yml Normal file
View File

@ -0,0 +1,30 @@
######################################
# Global vars for Cloud-Cluster #
######################################
# Name of the environment to deploy to:
# - 'production' (production environment)
# - 'staging' (staging environment)
# - 'development' (development environment)
# Users can expand sub-domains as they wish
# masterDomain: 'example.com'
masterDomain: masasana.ai
deployEnvironment: production
subdomains:
staging: stg
development: dev
# ToDo: Check if microk8s uses different classes
# ToDo: find usages
k8s_ingress_class: nginx
# Identity Management
idm_domain: auth.{{ domain }}
idmDomain: "{{ idm_domain }}"
############################################################################
# computed variables -> please don't modify by hand! #
############################################################################
domain: '{% if deployEnvironment != "production" %}{{ subdomains[deployEnvironment] }}.{% endif %}{{ masterDomain }}'
kubernetesApi: '{{ hostvars[groups["kubernetes_api"][0]].ansible_host }}'
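
Worked example of the computed domain logic above: with deployEnvironment: production the subdomain prefix is skipped, so domain renders to masasana.ai and idm_domain to auth.masasana.ai; with deployEnvironment: staging they render to stg.masasana.ai and auth.stg.masasana.ai. kubernetesApi resolves to the ansible_host of the first host in the kubernetes_api group. A quick way to verify (illustrative task, not from this repo):

- name: Show computed domains
  ansible.builtin.debug:
    msg: "domain={{ domain }}, idm={{ idm_domain }}, api={{ kubernetesApi }}"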

View File

@ -0,0 +1,11 @@
######################################
# cert-manager vars for cluster #
######################################
namespace: "cert-manager"
email: wefers@masasana.ai
helm:
repoUrl: "https://charts.jetstack.io"
chart: "cert-manager/cert-manager"
releaseName: "cert-manager"
chartVersion: "1.14.4"
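
A hedged sketch of how these helm vars could drive a deployment via the kubernetes.core collection; task names and the installCRDs value are assumptions, not taken from this repo:

- name: Add chart repository
  kubernetes.core.helm_repository:
    name: "{{ helm.chart.split('/')[0] }}"   # "cert-manager", per the chart ref above
    repo_url: "{{ helm.repoUrl }}"

- name: Install cert-manager
  kubernetes.core.helm:
    name: "{{ helm.releaseName }}"
    chart_ref: "{{ helm.chart }}"
    chart_version: "{{ helm.chartVersion }}"
    release_namespace: "{{ namespace }}"
    create_namespace: true
    values:
      installCRDs: true   # assumed; cert-manager needs its CRDs installed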

View File

@ -0,0 +1,25 @@
######################################
# Global vars for Cloud-Cluster #
######################################
# Debug mode for test_task.yml
debug: False
# Cluster name
kubernetesClusterName: 'marcel-stg'
# Kubernetes version
kubernetesVersion: '1.29.3'
# Cluster Type. Possible types:
# - vanilla # plain kubernetes (HA)
kubernetesClusterType: 'vanilla'
# Decide whether kube-vip should be installed
installKubeVip: False

View File

@ -0,0 +1,6 @@
######################################
# helm3 vars for cluster #
######################################
# Helm install script path
helm_install_script: 'https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3'
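
A hedged sketch of consuming this var; paths and task names are assumptions, and the upstream get-helm-3 script installs to /usr/local/bin/helm by default:

- name: Download the helm install script
  ansible.builtin.get_url:
    url: "{{ helm_install_script }}"
    dest: /tmp/get-helm-3.sh
    mode: "0755"

- name: Run the helm install script
  ansible.builtin.command: /tmp/get-helm-3.sh
  args:
    creates: /usr/local/bin/helm   # skip if helm is already installed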

View File

@ -0,0 +1,11 @@
######################################
# Kubernetes ingress controller vars #
######################################
namespace: 'ingress-nginx'
helm:
repoUrl: 'https://kubernetes.github.io/ingress-nginx'
chart: 'ingress-nginx/ingress-nginx'
releaseName: 'ingress-nginx'
chartVersion: '4.10.0'

View File

@ -0,0 +1,9 @@
######################################
# kube-vip vars for virtual IP #
######################################
# KubeVIP virtual IP settings
virtual_ip: '10.42.0.100'
interface: 'enp7s0'
kube_vip_version: 'v0.5.0'
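
These values typically feed kube-vip's manifest generation flags (--interface/--address, per the upstream kube-vip docs); a heavily hedged sketch of one common invocation pattern — the exact wiring in this repo is not shown here:

- name: Pull kube-vip image (illustrative)
  ansible.builtin.command: ctr image pull ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}

- name: Generate kube-vip static pod manifest (illustrative)
  ansible.builtin.shell: >
    ctr run --rm --net-host
    ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }} vip
    /kube-vip manifest pod
    --interface {{ interface }}
    --address {{ virtual_ip }}
    --controlplane --arp --leaderElection
    > /etc/kubernetes/manifests/kube-vip.yaml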

View File

@ -0,0 +1,8 @@
######################################
# System vars for Linux Systems #
######################################
# CRI-O container engine + version
crio_version: '1.28'
calico_main_interface: 'interface=enp7s0'
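
The value format matches Calico's IP_AUTODETECTION_METHOD syntax; a hedged sketch of where it typically ends up (assumes the standard calico-node DaemonSet in kube-system — how this repo injects it is not shown in this file):

- name: Pin Calico to the main interface (illustrative)
  ansible.builtin.command: >
    kubectl -n kube-system set env daemonset/calico-node
    IP_AUTODETECTION_METHOD={{ calico_main_interface }}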

View File

@ -0,0 +1,12 @@
######################################
# MetalLB Configuration #
######################################
namespace: "metallb"
helm:
repoUrl: "https://metallb.github.io/metallb"
chart: "metallb/metallb"
releaseName: "metallb"
chartVersion: "v0.14.4"
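
After the chart is installed, MetalLB v0.14.x still needs an address pool and an L2 advertisement before it can assign LoadBalancer IPs; a minimal sketch in which the pool name and address range are assumptions:

apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
  name: default-pool            # hypothetical name
  namespace: metallb
spec:
  addresses:
    - 10.0.0.240-10.0.0.250     # assumed range, adjust to your network
---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
  name: default-l2              # hypothetical name
  namespace: metallb
spec:
  ipAddressPools:
    - default-pool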

View File

@ -0,0 +1,34 @@
######################################
# rook-ceph vars for storage #
######################################
namespace: 'rook-ceph'
CLIENT_CHECKER_NAME: 'client.healthchecker'
RGW_POOL_PREFIX: 'default'
rook_external:
csi_rbd_provisioner_secret: ""
csi_rbd_node_secret: ""
# possible types:
# - dev (2 mons / 4 osds - could be changed in the rook cluster-test.yml) # ToDo: set variables
# - prod (requires a minimum set of 3 mons)
rook_cluster_type: 'dev'
# The prod and dev keys are fixed. Please do not change the names
rook_cluster_configs:
dev:
name: 'my-cluster'
mons: 1
osds: 3
mgrs: 1
prod:
name: 'rook-ceph'
mons: 3
mgrs: 2
############################################################################
# computed variables -> please don't modify by hand! #
############################################################################
rook_cluster_config: "{{ rook_cluster_configs[rook_cluster_type] }}"
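
A sketch of how rook_cluster_config might be consumed in a Jinja-templated CephCluster manifest; the field mapping below is an assumption for illustration — mons and mgrs map directly to the CRD's count fields, while the dev osds value would feed the storage selection, which is not shown here:

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: "{{ rook_cluster_config.name }}"
  namespace: "{{ namespace }}"
spec:
  mon:
    count: {{ rook_cluster_config.mons }}
  mgr:
    count: {{ rook_cluster_config.mgrs }}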

View File

@ -0,0 +1,19 @@
######################################
# System vars for Linux Systems #
######################################
# ToDo: Linux base hardening
# Nameserver IPv4 Addresses
# /etc/resolv.conf
#ns1: '8.8.8.8'
#ns2: '4.4.4.4'
# package versions
k8s_pip_version: '25.3.0'
# Necessary system packages we need to install
# ToDo: Pin packages to a fixed version; test split string
# ToDo: describe the version split
sys_packages: [ 'curl', 'nano', 'python3', 'python3-pip', 'htop', 'lsb-release', 'git' ]
k8s_sys_packages: [ 'open-iscsi', 'apt-transport-https', 'ca-certificates', 'gnupg' ]
pip_packages: ['PyYAML', 'jmespath', 'kubernetes>={{ k8s_pip_version }},<{{ (k8s_pip_version | string).split(".")[0] | int + 1 }}']
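
Worked example of the version constraint above: with k8s_pip_version: '25.3.0', split(".")[0] yields "25" and the int filter plus 1 gives 26, so the rendered requirement is 'kubernetes>=25.3.0,<26' — i.e. any 25.x release from 25.3.0 onwards, but never the next major version.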