6-21 993 views
引子
在分布式系统中,保证客户端的缓存一致性非常重要。
假设有两个客户端Client A和Client B。集群的文件系统中有这么一个目录:/a/b/c/d/。
Client A创建文件/a/b/c/d/a.txt成功,同时缓存了这个目录。
Client B移动目录’d’到/a/下面。目录变成/a/b/c/和/a/d/。
Client A尝试创建文件/a/b/c/d/b.txt。
为了性能考虑,Client A会直接从缓存中读取目录信息,而不是每次都通过网络询问。
Client A从缓存中得知,目录/a/b/c/d/存在,于是获取’d’的Inode并发送创建文件请求(在目录’d’下创建b.txt)。服务器响应请求并在/a/d/下创建b.txt。
很明显,这种行为是错误的,因为目录/a/b/c/d/已经不再存在。
经过测试,Ceph很好地解决了这一问题。
代码分析
首先看open函数:
/**
 * Open the file named by relpath, creating it first when O_CREAT is set and
 * the path does not exist, and register the resulting handle in fd_map.
 *
 * @param relpath      path to open; resolved by path_walk()
 * @param flags        open(2)-style flags (O_CREAT, O_EXCL, O_NOFOLLOW, O_PATH, ...)
 * @param perms        credentials used for path resolution and permission checks
 * @param mode         forwarded to ceph_caps_for_mode(), _create() and _open()
 * @param stripe_unit  forwarded unchanged to _create()
 * @param stripe_count forwarded unchanged to _create()
 * @param object_size  forwarded unchanged to _create()
 * @param data_pool    forwarded unchanged to _create()
 * @return the descriptor obtained from get_fd() on success, otherwise a
 *         negative errno value (-ENOTCONN while unmounting, -EEXIST for
 *         O_CREAT|O_EXCL on an existing path, -ELOOP for a symlink with
 *         O_NOFOLLOW, or whatever the helpers returned).
 *
 * Note: the -ENOTCONN, -EEXIST and -ELOOP paths return directly and therefore
 * skip the exit trace emitted at the `out` label.
 */
int Client::open(const char *relpath, int flags, const UserPerm &perms,
                 mode_t mode, int stripe_unit, int stripe_count,
                 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  // client_lock is held for the whole call.
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL; // file handle for any open file state

#if defined(__linux__) && defined(O_PATH)
  /* When O_PATH is specified, flags other than O_DIRECTORY and O_NOFOLLOW
   * are ignored. Please refer to the do_entry_open() function in the
   * kernel (fs/open.c).
   */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
  // followsym: whether a symlink in the final path component is followed.
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // The path resolved, but the caller demanded exclusive creation.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // The target does not exist: resolve the parent directory and create
    // the last path component inside it.
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
                  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
        goto out;
    }
    // _create() may also hand back an open handle through &fh.
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
        goto out;
    }
  }

  // Only open explicitly when the create path did not already yield a handle.
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate an integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
其中path_walk函数用于逐级解析路径,返回最终的InodeRef。
/**
 * Resolve origpath one component at a time and hand back the final inode.
 *
 * The walk starts at root for absolute paths and at cwd otherwise, and calls
 * _lookup() for every component. Symlinks met along the way are spliced into
 * a local copy of the path; origpath itself is never modified.
 *
 * @param origpath  path to resolve
 * @param end       if non-null, receives the inode the walk ended on
 * @param perms     credentials used for may_lookup() and _lookup()
 * @param followsym whether a symlink in the last component is followed
 * @param mask      extra caps requested on the last component only
 * @return 0 on success; a negative errno from may_lookup()/_lookup();
 *         -ELOOP once more than MAXSYMLINKS symlinks have been seen;
 *         -ENOENT if the walk ends without an inode.
 */
int Client::path_walk(const filepath &origpath, InodeRef *end,
                      const UserPerm &perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i = 0;
  // Per the original annotation, path.depth() runs parse_bits() and returns
  // bits.size(), i.e. the number of path components.
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
        return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
        return -ELOOP;
      }

      if (i < path.depth() - 1) {
        // dir symlink
        // replace consumed components of path with symlink dir target
        filepath resolved(next->symlink.c_str());
        resolved.append(path.postfixpath(i + 1));
        path = resolved;
        i = 0;
        // An absolute target restarts from root; otherwise cur is left as is.
        if (next->symlink[0] == '/') {
          cur = root;
        }
        continue;
      } else if (followsym) {
        if (next->symlink[0] == '/') {
          path = next->symlink.c_str();
          i = 0;
          // reset position
          cur = root;
        } else {
          filepath more(next->symlink.c_str());
          // we need to remove the symlink component from off of the path
          // before adding the target that the symlink points to. remain
          // at the same position in the path.
          path.pop_dentry();
          path.append(more);
        }
        continue;
      }
    }
    // Descend: the inode just looked up becomes the current directory.
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
其中_lookup函数用于在目录dir中查找名为dname的下一级路径分量(目录或文件)。
int Client::_lookup(Inode *dir, const string &dname, int mask, InodeRef *target, const UserPerm &perms) { int r = 0; Dentry *dn = NULL; if (dname == "..") { if (dir->dentries.empty())//xlist<Dentry *> dentries; // if i'm linked to a dentry.缓存不存在 { MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT); filepath path(dir->ino); req->set_filepath(path); InodeRef tmptarget; int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds()); if (r == 0) { Inode *tempino = tmptarget.get(); _ll_get(tempino); *target = tempino; ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl; } else { *target = dir; } } else//缓存存在 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked goto done; } if (dname == ".") { *target = dir; goto done; } if (!dir->is_dir()) { r = -ENOTDIR; goto done; } if (dname.length() > NAME_MAX) { r = -ENAMETOOLONG; goto done; } if (dname == cct->_conf->client_snapdir && dir->snapid == CEPH_NOSNAP) { *target = open_snapdir(dir); goto done; } if (dir->dir &&//Inode -> Dir //if i am a dir dir->dir->dentries.count(dname))//存在名为dname的子目录dentry unordered_map<string, Dentry*> { dn = dir->dir->dentries[dname]; ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl; if (!dn->inode || dn->inode->caps_issued_mask(mask, true))//dentry未link(子目录不存在),或子目录存在并有满足mask的cap { // is dn lease valid? utime_t now = ceph_clock_now(); if (dn->lease_mds >= 0 && dn->lease_ttl > now && mds_sessions.count(dn->lease_mds))//dentry未撤销,未过期 { MetaSession &s = mds_sessions.at(dn->lease_mds); if (s.cap_ttl > now && s.cap_gen == dn->lease_gen)//MetaSession的cap未过期,且此dentry的lease来自这个MetaSession { // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to // make trim_caps() behave. 
dir->try_touch_cap(dn->lease_mds); goto hit_dn; } ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen << " vs lease_gen " << dn->lease_gen << dendl;. } // dir shared caps? if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) { if (dn->cap_shared_gen == dir->shared_gen && (!dn->inode || dn->inode->caps_issued_mask(mask, true))) goto hit_dn; if (!dn->inode && (dir->flags & I_COMPLETE)) { ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl; return -ENOENT; } } } else { ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl; } } else { // can we conclude ENOENT locally? if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) && (dir->flags & I_COMPLETE)) { ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl; return -ENOENT; } } r = _do_lookup(dir, dname, mask, target, perms);//向MDS查询 goto done; hit_dn: if (dn->inode) { *target = dn->inode; } else { r = -ENOENT; } touch_dn(dn); done: if (r < 0) ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl; else ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl; return r; }
_lookup函数解析
首先,特判下一级目录为“..”和“.”的情况:
如果是“..”,查上级目录dir的dentry缓存。如果不存在,向MDS查询(查询失败时返回dir自身);如果存在,直接返回父目录。
如果是“.”,直接返回此目录。
1.排除掉一些错误情况后,看dir是否是目录且存在名为dname的dentry?
如果存在,下一步;
如果不存在,查询上级目录是否有CAP_FILE_SHARED且带有I_COMPLETE标记。根据上一篇文章所述,如果上级目录拥有此CAP,说明它的目录信息是正确的;I_COMPLETE又表明该目录的内容已完整缓存在本地,此时没查到说明本身就是没有,返回“不存在”。否则向MDS查询。
2.dentry未link到Inode 或 已经link并且对应Inode有满足mask的cap?
如果是,下一步;
如果不是,向MDS查询。
3.dentry的lease未撤销、未过期,且mds_sessions中有dentry的lease_mds?
如果是,下一步;
如果不是,跳转到5。
4.MetaSession的cap未过期且cap_gen == lease_gen?(gen大概就是类似指纹一样的东西)
如果是,缓存命中,touch目录的cap(try_touch_cap)和该dentry(touch_dn);
如果不是,下一步。
5.上级目录是否有CAP_FILE_SHARED?
如果是,下一步;
如果不是,向MDS查询。
6.dn->cap_shared_gen == dir->shared_gen?
如果是,缓存命中。和之前同理,有cap且指纹相同能证明本地缓存的正确性。
如果不是,dentry未link且目录带有I_COMPLETE标记则返回不存在,否则向MDS查询。
总结
回到最开始的问题,当Client B移动目录‘d’到/a/的时候,MDS发送消息撤销了Client A上‘d’的dentry的lease,使得lease_mds = -1而验证无法通过,同时撤销相关caps,从而保证客户端缓存一致性。
- Ceph客户端缓存一致性分析 - 2019年6月21日
- CephFS:什么是CAPS? - 2019年6月21日
- Ceph集群快速搭建教程 - 2019年6月20日