Dynamic provisioning based on CephFS


Dynamic CephFS storage in Kubernetes

Environment

hostname      ip         os       roles
liran-test-1  10.0.7.19  centos7  mon, mgr, mds, osd
liran-test-2  10.0.7.28  centos7  mon, osd
liran-test-3  10.0.7.15  centos7  mon, osd
k8s-ceshi-1   10.0.7.26  centos7  master
k8s-ceshi-2   10.0.7.34  centos7  master
k8s-ceshi-3   10.0.7.44  centos7  master
k8s-ceshi-4   10.0.7.13  centos7  node
k8s-ceshi-5   10.0.7.17  centos7  node

Installing the CephFS provisioner

https://github.com/kubernetes-incubator/external-storage/tree/master/ceph/cephfs

Create the namespace

[root@k8s-ceshi-01.novalocal 16:08 ~/k8s/ceph/k8s-cephfs]
# cat cephfs-ns.yaml 
apiVersion: v1
kind: Namespace
metadata:
  name: cephfs
  labels:
   name: cephfs
[root@k8s-ceshi-01.novalocal 16:12 ~/k8s/ceph/k8s-cephfs]
# kubectl apply -f cephfs-ns.yaml 
namespace/cephfs created

Create the secret

Obtain the admin key from Ceph and base64-encode it

[root@liran-test-1.novalocal 16:15 ~/cluster]
# echo "AQAzwo1dfNI2GhAAcDsTfLmMKUQr4grR0tysDw==" | base64
QVFBendvMWRmTkkyR2hBQWNEc1RmTG1NS1VRcjRnclIwdHlzRHc9PQo=
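
Alternatively, the admin key can be pulled straight out of the cluster instead of being pasted in. A minimal sketch, assuming the client.admin keyring is readable on the monitor host (note that ceph auth get-key prints the key without a trailing newline, so the encoded value may differ slightly from the echo output above):

# ceph auth get-key client.admin | base64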

Create the Ceph admin secret in Kubernetes

# cat ceph-secret.yaml 
apiVersion: v1
kind: Secret
metadata:
  name: cephfs-admin-secret
  namespace: cephfs
type: "kubernetes.io/rbd"
data:
  key: QVFBendvMWRmTkkyR2hBQWNEc1RmTG1NS1VRcjRnclIwdHlzRHc9PQo=
[root@k8s-ceshi-01.novalocal 16:57 ~/k8s/ceph/k8s-cephfs]
# kubectl apply -f ceph-secret.yaml 
secret/cephfs-admin-secret created

Create the service account

# cat serviceaccount.yaml 
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cephfs-provisioner
  namespace: cephfs
# kubectl apply -f serviceaccount.yaml 
serviceaccount/cephfs-provisioner created

Create the role

# cat ceph-role.yaml 
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: cephfs-provisioner
  namespace: cephfs
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["create", "get", "delete"]
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
# kubectl apply -f ceph-role.yaml 
role.rbac.authorization.k8s.io/cephfs-provisioner created

Create the cluster role

# cat clusterrole.yaml 
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: cephfs-provisioner
  namespace: cephfs
rules:
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "update", "patch"]
  - apiGroups: [""]
    resources: ["services"]
    resourceNames: ["kube-dns","coredns"]
    verbs: ["list", "get"]
# kubectl apply -f clusterrole.yaml 
clusterrole.rbac.authorization.k8s.io/cephfs-provisioner created

Bind the role

[root@k8s-ceshi-01.novalocal 17:13 ~/k8s/ceph/k8s-cephfs]
# cat rolebinding.yaml 
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: cephfs-provisioner
  namespace: cephfs
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: cephfs-provisioner
subjects:
- kind: ServiceAccount
  name: cephfs-provisioner
  namespace: cephfs

[root@k8s-ceshi-01.novalocal 17:13 ~/k8s/ceph/k8s-cephfs]
# kubectl apply -f rolebinding.yaml 
rolebinding.rbac.authorization.k8s.io/cephfs-provisioner created

Bind the cluster role

[root@k8s-ceshi-01.novalocal 17:14 ~/k8s/ceph/k8s-cephfs]
# cat clusterrolebinding.yaml 
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: cephfs-provisioner
subjects:
  - kind: ServiceAccount
    name: cephfs-provisioner
    namespace: cephfs
roleRef:
  kind: ClusterRole
  name: cephfs-provisioner
  apiGroup: rbac.authorization.k8s.io

[root@k8s-ceshi-01.novalocal 17:14 ~/k8s/ceph/k8s-cephfs]
# kubectl apply -f clusterrolebinding.yaml 
clusterrolebinding.rbac.authorization.k8s.io/cephfs-provisioner created
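
Before deploying the provisioner, the bindings can be sanity-checked with kubectl auth can-i; given the Role and ClusterRole above, both of these should answer yes:

# kubectl auth can-i create persistentvolumes --as=system:serviceaccount:cephfs:cephfs-provisioner
# kubectl auth can-i create secrets -n cephfs --as=system:serviceaccount:cephfs:cephfs-provisioner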

Deploy the cephfs-provisioner

# cat deployment.yaml 
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: cephfs-provisioner
  namespace: cephfs
spec:
  replicas: 1
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: cephfs-provisioner
    spec:
      containers:
      - name: cephfs-provisioner
        image: "harbor-ceshi.sreblog.com/k8s/cephfs-provisioner:v2.1.0-k8s1.11"
        env:
        - name: PROVISIONER_NAME
          value: ceph.com/cephfs
        - name: PROVISIONER_SECRET_NAMESPACE
          value: cephfs
        command:
        - "/usr/local/bin/cephfs-provisioner"
        args:
        - "-id=cephfs-provisioner-1"
        - "-disable-ceph-namespace-isolation=true"
      serviceAccount: cephfs-provisioner
[root@k8s-ceshi-01.novalocal 17:19 ~/k8s/ceph/k8s-cephfs]
# kubectl apply -f deployment.yaml                               
deployment.extensions/cephfs-provisioner created

[root@k8s-ceshi-01.novalocal 17:19 ~/k8s/ceph/k8s-cephfs]
# kubectl get pod -n cephfs
NAME                                  READY   STATUS    RESTARTS   AGE
cephfs-provisioner-6568c77b4b-246nv   1/1     Running   0          13s
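
If the pod does not come up healthy, the provisioner logs are the first place to look. A quick check, assuming the app=cephfs-provisioner label from the deployment above:

# kubectl -n cephfs logs -l app=cephfs-provisioner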

Create the StorageClass

# cat cephfs-storageclass.yaml 
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: cephfs
  namespace: cephfs
provisioner: ceph.com/cephfs
reclaimPolicy: Delete
parameters:
    monitors: 10.0.7.19:6789
    adminId: admin
    adminSecretName: cephfs-admin-secret
    adminSecretNamespace: "cephfs"
    claimRoot: /volumes/kubernetes
[root@k8s-ceshi-01.novalocal 17:22 ~/k8s/ceph/k8s-cephfs]
# kubectl apply -f cephfs-storageclass.yaml 
storageclass.storage.k8s.io/cephfs created

[root@k8s-ceshi-01.novalocal 17:22 ~/k8s/ceph/k8s-cephfs]
# kubectl get sc
NAME       PROVISIONER       AGE
ceph-rbd   ceph.com/rbd      5d
cephfs     ceph.com/cephfs   4s

Test

Create a PVC

# cat test-pvc.yaml 
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: claim1
spec:
  storageClassName: cephfs
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
# kubectl apply -f test-pvc.yaml
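
Once the provisioner has processed the claim it should move to Bound, with a dynamically created PV behind it. A quick check:

# kubectl get pvc claim1
# kubectl get pv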

Create a pod

# cat pod-test.yaml 
kind: Pod
apiVersion: v1
metadata:
  name: pod-test2
spec:
  containers:
  - name: test-pod2
    image: centos
    command:
      - "/bin/sh"
    args:
      - "-c"
      - "ping 127.0.0.1"
    volumeMounts:
      - name: pvc
        mountPath: "/mnt"
  restartPolicy: "Always"
  volumes:
    - name: pvc
      persistentVolumeClaim:
        claimName: claim1

[root@k8s-ceshi-01.novalocal 17:25 ~/k8s/ceph/k8s-cephfs]
# kubectl apply -f pod-test.yaml 
pod/pod-test2 created

[root@k8s-ceshi-01.novalocal 17:25 ~/k8s/ceph/k8s-cephfs]
# kubectl get pod
NAME                                 READY   STATUS              RESTARTS   AGE
ceph-rbd-pvc-test-59d4869cfb-jkg6q   1/1     Running             0          3d7h
pod-test2                            0/1     ContainerCreating   0          4s
rbd-provisioner-77cc657d45-l6hfw     1/1     Running             98         66d

[root@k8s-ceshi-01.novalocal 17:26 ~/k8s/ceph/k8s-cephfs]
# kubectl get pod
NAME                                 READY   STATUS    RESTARTS   AGE
ceph-rbd-pvc-test-59d4869cfb-jkg6q   1/1     Running   0          3d7h
pod-test2                            1/1     Running   0          67s
rbd-provisioner-77cc657d45-l6hfw     1/1     Running   98         66d

[root@k8s-ceshi-01.novalocal 17:26 ~/k8s/ceph/k8s-cephfs]
# kubectl exec -it pod-test2 /bin/bash
[root@pod-test2 /]# df -hT
Filesystem                                                                                                Type     Size  Used Avail Use% Mounted on
overlay                                                                                                   overlay   40G   32G  8.8G  79% /
tmpfs                                                                                                     tmpfs     64M     0   64M   0% /dev
tmpfs                                                                                                     tmpfs    5.8G     0  5.8G   0% /sys/fs/cgroup
10.0.7.19:6789:/volumes/kubernetes/kubernetes/kubernetes-dynamic-pvc-01c1e82e-e29b-11e9-ba7e-3278bf47aa4d ceph      28G     0   28G   0% /mnt
/dev/vda1                                                                                                 xfs       40G   32G  8.8G  79% /etc/hosts
shm                                                                                                       tmpfs     64M     0   64M   0% /dev/shm
tmpfs                                                                                                     tmpfs    5.8G   12K  5.8G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs                                                                                                     tmpfs    5.8G     0  5.8G   0% /proc/acpi
tmpfs                                                                                                     tmpfs    5.8G     0  5.8G   0% /proc/scsi
tmpfs                                                                                                     tmpfs    5.8G     0  5.8G   0% /sys/firmware
[root@pod-test2 /]# cd /mnt
[root@pod-test2 mnt]# ls
[root@pod-test2 mnt]# exit

As shown above, the pod has the dynamically provisioned CephFS directory mounted at /mnt as expected.
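
A quick write test from outside the pod confirms the mount is actually writable (if this fails with an input/output error, see the troubleshooting section below):

# kubectl exec pod-test2 -- sh -c 'echo hello > /mnt/test.txt && cat /mnt/test.txt'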

Direct mount mode

Instead of going through the provisioner, a pod can also mount CephFS directly via the in-tree cephfs volume source, for example:

apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: test
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: test
    spec:
      containers:
      - name: test
        image: dk-reg.op.douyuyuba.com/op-base/openresty:1.9.15
        ports:
        - containerPort: 80
        volumeMounts:
          - mountPath: "/data"
            name: data
      volumes:
        - name: data
          cephfs:
            monitors:
              - 10.5.10.117:6789
              - 10.5.10.236:6789
              - 10.5.10.227:6789
            path: /data
            user: admin
            secretRef:
              name: ceph-secret
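
The ceph-secret referenced above must already exist in the pod's namespace and carry the Ceph key under the data key "key". A minimal sketch, reusing the admin key from earlier (the secret name only has to match the secretRef in the pod spec):

# kubectl create secret generic ceph-secret --from-literal=key='AQAzwo1dfNI2GhAAcDsTfLmMKUQr4grR0tysDw=='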

Troubleshooting

Input/output error

Inside a container where CephFS appears to be mounted correctly, writing to the mounted directory fails with an error:

[root@pod-test2 mnt]# echo "afdgf" >> abc
bash: abc: Input/output error

Solution

Option 1: upgrade the kernel to 4.4 or later.
Option 2: add the argument "-disable-ceph-namespace-isolation=true" to the provisioner deployment to disable Ceph's namespace isolation feature.
With the argument added, the deployment looks like this:
# cat deployment.yaml 
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: cephfs-provisioner
  namespace: cephfs
spec:
  replicas: 1
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: cephfs-provisioner
    spec:
      containers:
      - name: cephfs-provisioner
        image: "harbor-ceshi.sreblog.com/k8s/cephfs-provisioner:v2.1.0-k8s1.11"
        env:
        - name: PROVISIONER_NAME
          value: ceph.com/cephfs
        - name: PROVISIONER_SECRET_NAMESPACE
          value: cephfs
        command:
        - "/usr/local/bin/cephfs-provisioner"
        args:
        - "-id=cephfs-provisioner-1"
        - "-disable-ceph-namespace-isolation=true"
      serviceAccount: cephfs-provisioner

Notes on resizing

CephFS quota enforcement currently only works with the FUSE client (ceph-fuse/libcephfs) or a recent kernel client (the provisioner source quoted below notes kernel >= 4.17 on Mimic clusters).

Resizing is implemented by setting a quota on the directory:

setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir     # 100 MB
setfattr -n ceph.quota.max_files -v 10000 /some/dir         # 10,000 files

Source code analysis

https://github.com/kubernetes-incubator/external-storage/tree/master/ceph/cephfs/cephfs_provisioner

The CephFS provisioner is implemented mainly in two files:

  • cephfs-provisioner.go: the provisioner entry point
  • cephfs_provisioner/cephfs_provisioner.py: the script that does the actual work

cephfs-provisioner.go

Analysis of Provision() in cephfs-provisioner.go:

  • Parse the parameters
    func (p *cephFSProvisioner) Provision(options controller.VolumeOptions) (*v1.PersistentVolume, error) {
      if options.PVC.Spec.Selector != nil {
          return nil, fmt.Errorf("claim Selector is not supported")
      }
      cluster, adminID, adminSecret, pvcRoot, mon, deterministicNames, err := p.parseParameters(options.Parameters)
      if err != nil {
          return nil, err
      }
  • Determine the share and user names
    var share, user string
      if deterministicNames {
          share = options.PVC.Name
          user = fmt.Sprintf("k8s.%s.%s", options.PVC.Namespace, options.PVC.Name)
      } else {
          // create random share name
          share = fmt.Sprintf("kubernetes-dynamic-pvc-%s", uuid.NewUUID())
          // create random user id
          user = fmt.Sprintf("kubernetes-dynamic-user-%s", uuid.NewUUID())
      }
  • Build the command
    Assemble the exec.Command and its environment, then invoke the Python script:

    // provision share
      // create cmd
      args := []string{"-n", share, "-u", user}
      if p.enableQuota {
          capacity := options.PVC.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
          requestBytes := strconv.FormatInt(capacity.Value(), 10)
          args = append(args, "-s", requestBytes)
      }
      cmd := exec.Command(provisionCmd, args...)
      // set env
      cmd.Env = []string{
          "CEPH_CLUSTER_NAME=" + cluster,
          "CEPH_MON=" + strings.Join(mon[:], ","),
          "CEPH_AUTH_ID=" + adminID,
          "CEPH_AUTH_KEY=" + adminSecret,
          "CEPH_VOLUME_ROOT=" + pvcRoot}
      if deterministicNames {
          cmd.Env = append(cmd.Env, "CEPH_VOLUME_GROUP="+options.PVC.Namespace)
      }
      if *disableCephNamespaceIsolation {
          cmd.Env = append(cmd.Env, "CEPH_NAMESPACE_ISOLATION_DISABLED=true")
      }
    
      output, cmdErr := cmd.CombinedOutput()
      if cmdErr != nil {
          klog.Errorf("failed to provision share %q for %q, err: %v, output: %v", share, user, cmdErr, string(output))
          return nil, cmdErr
      }
      // validate output
      res := &provisionOutput{}
      json.Unmarshal([]byte(output), &res)
      if res.User == "" || res.Secret == "" || res.Path == "" {
          return nil, fmt.Errorf("invalid provisioner output")
      }
  • Set the PV attributes

    nameSpace := p.secretNamespace
      if nameSpace == "" {
          // if empty, create secret in PVC's namespace
          nameSpace = options.PVC.Namespace
      }
      secretName := generateSecretName(user)
      secret := &v1.Secret{
          ObjectMeta: metav1.ObjectMeta{
              Namespace: nameSpace,
              Name:      secretName,
          },
          Data: map[string][]byte{
              "key": []byte(res.Secret),
          },
          Type: "Opaque",
      }
    
      _, err = p.client.CoreV1().Secrets(nameSpace).Create(secret)
      if err != nil && !apierrors.IsAlreadyExists(err) {
          klog.Errorf("Cephfs Provisioner: create volume failed, err: %v", err)
          return nil, fmt.Errorf("failed to create secret")
      }
    
      pv := &v1.PersistentVolume{
          ObjectMeta: metav1.ObjectMeta{
              Name: options.PVName,
              Annotations: map[string]string{
                  provisionerIDAnn: p.identity,
                  cephShareAnn:     share,
              },
          },
          Spec: v1.PersistentVolumeSpec{
              PersistentVolumeReclaimPolicy: options.PersistentVolumeReclaimPolicy,
              AccessModes:                   options.PVC.Spec.AccessModes,
              MountOptions:                  options.MountOptions,
              Capacity: v1.ResourceList{
                  // Quotas are supported by the userspace client(ceph-fuse, libcephfs), or kernel client >= 4.17 but only on mimic clusters.
                  // In other cases capacity is meaningless here.
                  // If quota is enabled, provisioner will set ceph.quota.max_bytes on volume path.
                  v1.ResourceName(v1.ResourceStorage): options.PVC.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)],
              },
              PersistentVolumeSource: v1.PersistentVolumeSource{
                  CephFS: &v1.CephFSPersistentVolumeSource{
                      Monitors: mon,
                      Path:     res.Path[strings.Index(res.Path, "/"):],
                      SecretRef: &v1.SecretReference{
                          Name:      secretName,
                          Namespace: nameSpace,
                      },
                      User: user,
                  },
              },
          },
      }
    
      klog.Infof("successfully created CephFS share %+v", pv.Spec.PersistentVolumeSource.CephFS)
    
      return pv, nil
    }

cephfs_provisioner.py

The main function

def main():
    create = True
    share = ""
    user = ""
    size = None
    cephfs = CephFSNativeDriver()
    try:
        opts, args = getopt.getopt(sys.argv[1:], "rn:u:s:", ["remove"])
    except getopt.GetoptError:
        usage()
        sys.exit(1)

    for opt, arg in opts:
        if opt == '-n':
            share = arg
        elif opt == '-u':
            user = arg
        elif opt == '-s':
            size = arg
        elif opt in ("-r", "--remove"):
            create = False

    if share == "" or user == "":
        usage()
        sys.exit(1)

    if create:
        print cephfs.create_share(share, user, size=size)
    else:
        cephfs.delete_share(share, user)

The script parses its command-line flags: for create it calls cephfs.create_share() to create the share that backs the PV;
for remove (-r) it calls delete_share() to remove it.
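
For debugging, the script can also be invoked by hand with the same environment variables the Go code sets above. A rough sketch, assuming the default cluster name ceph and placeholder share/user names; the script's install path (provisionCmd) is not shown in the excerpt, so the path below is a guess:

CEPH_CLUSTER_NAME=ceph \
CEPH_MON=10.0.7.19:6789 \
CEPH_AUTH_ID=admin \
CEPH_AUTH_KEY='AQAzwo1dfNI2GhAAcDsTfLmMKUQr4grR0tysDw==' \
CEPH_VOLUME_ROOT=/volumes/kubernetes \
/usr/local/bin/cephfs_provisioner -n test-share -u test-user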

Extras

Quota settings in CephFS

  • ceph.quota.max_files limits the number of files under a directory
  • ceph.quota.max_bytes limits the total size of a directory

Set a quota

setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir     # 100 MB
setfattr -n ceph.quota.max_files -v 10000 /some/dir         # 10,000 files

View a quota (a value of 0 means no quota is set)

getfattr -n ceph.quota.max_bytes /some/dir
getfattr -n ceph.quota.max_files /some/dir

Remove a quota

setfattr -n ceph.quota.max_bytes -v 0 /some/dir
setfattr -n ceph.quota.max_files -v 0 /some/dir