问题现象:
实体机没问题,容器运行命令有特权(--privileged)的时候也没问题,而没有特权的时候,执行fakeroot会报错,容器执行命令如下:
docker run --device /dev/fuse -dit --cap-add SYS_ADMIN -p 8060:22 -v /tmp:/tmp -v /dev/fuse:/dev/fuse --name=xxx 一个ubuntu20的镜像
执行fakeroot(fakeroot版本1.37)的命令如下:(增加了strace辅助分析):
LD_PRELOAD=/tmp/libpreload-semop.so \
strace -f fakeroot -i .fakeroot.env -s .fakeroot.env /bin/bash -c debootstrap/debootstrap --verbose --second-stage --keep-debootstrap-dir
卡住前的输出:
[pid 13] read(3, <unfinished ...>
[pid 21] <... prlimit64 resumed>{rlim_cur=1024, rlim_max=512*1024}) = 0
[pid 20] <... exit_group resumed>) = ?
[pid 21] close_range(0, 1024, 0) = -1 EPERM (Operation not permitted)
[pid 21] setsid( <unfinished ...>
[pid 20] +++ exited with 0 +++
[pid 19] <... wait4 resumed>[{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 20
[pid 19] --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=20, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
[pid 19] rt_sigreturn({mask=[]}) = 20
[pid 19] wait4(-1, 0x7ffd3f022a8c, WNOHANG, NULL) = -1 ECHILD (No child processes)
[pid 19] dup2(10, 0) = 0
[pid 19] close(10 <unfinished ...>
[pid 21] <... setsid resumed>) = 21
[pid 19] <... close resumed>) = 0
[pid 21] msgrcv(32814, <unfinished ...>
[pid 19] exit_group(0) = ?
[pid 19] +++ exited with 0 +++
[pid 13] <... read resumed>0x7ffd3f023060, 128) = ? ERESTARTSYS (To be restarted if SA_RESTART is set)
[pid 13] --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=19, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
[pid 13] rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
[pid 13] read(3,
问题分析:
查阅了不少资料,常见的原因是文件描述符上限过大,可以通过升级fakeroot版本解决,但是我的版本已经高于1.35了。从strace输出中看到close_range(0, 1024, 0) = -1 EPERM (Operation not permitted),应该就是这里的权限问题了
处理方法:
注意这只是一种潜在的处理方法,拦截并降级close_range调用
#define _GNU_SOURCE
#include <dirent.h>
#include <dlfcn.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/unistd.h>
#include <linux/sem.h>
/*
 * Interposed semop(): issue the raw semop system call directly.
 *
 * glibc 2.31 implements semop() as semtimedop() with a NULL timespec, and
 * qemu 3.1 does not support semtimedop(), so the libc wrapper is bypassed here.
 *
 * Arguments are handed to syscall(__NR_semop, ...) unchanged; the value
 * reported by syscall() is returned to the caller.
 */
int semop(int semid, struct sembuf *sops, unsigned nsops)
{
    long rc = syscall(__NR_semop, semid, sops, nsops);

    return (int)rc;
}
/* Real fcntl64() from the next object in the lookup order; resolved on first use. */
static int (*original_fcntl64)(int fd, int cmd, ...);

/*
 * Interposed fcntl64(): downgrade open-file-description (OFD) lock requests
 * to traditional process-associated record locks and forward every other
 * command untouched.
 *
 * fd   - descriptor to operate on
 * cmd  - fcntl command; F_OFD_SETLK is rewritten to F_SETLK and
 *        F_OFD_SETLKW to F_SETLKW, anything else is passed through
 * ...  - optional third argument, forwarded as-is
 *
 * Returns whatever the real fcntl64() returns. If the real function cannot be
 * resolved, returns -1 with errno set to ENOSYS instead of calling through a
 * NULL pointer.
 */
int fcntl64(int fd, int cmd, ...) {
    va_list ap;
    void *arg;

    // The third argument is always fetched as a pointer-sized value, even for
    // commands that take an int or no argument at all, and forwarded verbatim.
    va_start(ap, cmd);
    arg = va_arg(ap, void *);
    va_end(ap);

    // Bind the real implementation lazily.
    if (!original_fcntl64) {
        original_fcntl64 = dlsym(RTLD_NEXT, "fcntl64");
    }
    if (!original_fcntl64) {
        errno = ENOSYS;
        return -1;
    }

    // Both OFD set-lock commands (blocking and non-blocking) are downgraded,
    // for every descriptor; the target path is not inspected.
    if (cmd == F_OFD_SETLKW || cmd == F_OFD_SETLK) {
        int traditional_cmd = (cmd == F_OFD_SETLKW) ? F_SETLKW : F_SETLK;

        fprintf(stderr, "[INFO] Intercepting OFD lock downgrading\n");
        // The same struct flock pointer is handed to the traditional command.
        return original_fcntl64(fd, traditional_cmd, arg);
    }

    // Default: pass all arguments through unchanged.
    return original_fcntl64(fd, cmd, arg);
}
/*
 * Fallback for close_range(): close every descriptor in [first, last] with
 * individual close() calls.
 *
 * first, last - inclusive descriptor range requested by the caller
 * flags       - accepted for signature compatibility but ignored
 *               (simplified implementation: descriptors are always closed)
 *
 * The upper bound is clamped to the process descriptor limit reported by
 * sysconf(_SC_OPEN_MAX) (1024 if that is unavailable). Without the clamp the
 * common "close everything from N upwards" call, where last is UINT_MAX,
 * would never terminate because an unsigned counter can never exceed
 * UINT_MAX, and values above INT_MAX would reach close() as negative ints.
 * Descriptors numbered at or above the current limit are therefore not
 * visited.
 *
 * Returns 0 on success. If any close() fails with an error other than EBADF,
 * returns -1 with errno set to the first such error; the remaining
 * descriptors are still closed.
 */
static int fallback_close_range(unsigned int first, unsigned int last, unsigned int flags) {
    enum { FALLBACK_OPEN_MAX = 1024 };
    int ret = 0;
    int saved_errno = 0;
    long open_max;
    unsigned int highest;

    (void)flags;

    open_max = sysconf(_SC_OPEN_MAX);
    if (open_max <= 0) {
        open_max = FALLBACK_OPEN_MAX;
    }
    if (open_max > INT_MAX) {
        open_max = INT_MAX;
    }
    highest = (unsigned int)open_max - 1U;
    if (last > highest) {
        last = highest;
    }
    if (first > last) {
        return 0;
    }

    // Test for the end of the range after the close so that fd never has to
    // be incremented past last (which could wrap around).
    for (unsigned int fd = first; ; fd++) {
        if (close((int)fd) < 0 && errno != EBADF && ret == 0) {
            // Remember only the first error that is not EBADF.
            saved_errno = errno;
            ret = -1;
        }
        if (fd == last) {
            break;
        }
    }

    if (ret < 0) {
        errno = saved_errno;
    }
    return ret;
}
/*
 * close_range() fallback that only touches descriptors that are actually
 * open, by listing /proc/self/fd.
 *
 * first, last - inclusive descriptor range requested by the caller
 * flags       - not interpreted here; only forwarded to
 *               fallback_close_range() when /proc/self/fd cannot be opened
 *
 * Returns 0 on success. If any close() fails with an error other than EBADF,
 * returns -1 with errno set to the first such error; the remaining entries
 * are still processed.
 */
static int optimized_fallback_close_range(unsigned int first, unsigned int last, unsigned int flags) {
    DIR *dir = opendir("/proc/self/fd");
    if (!dir) {
        // /proc is not readable: fall back to the plain loop.
        return fallback_close_range(first, last, flags);
    }

    int dir_fd = dirfd(dir);
    int ret = 0;
    int saved_errno = 0;
    struct dirent *entry;

    while ((entry = readdir(dir)) != NULL) {
        char *endptr;
        long fd;

        // Entries that are not plain decimal numbers (including "." and "..")
        // are skipped.
        errno = 0;
        fd = strtol(entry->d_name, &endptr, 10);
        if (endptr == entry->d_name || *endptr != '\0' || errno == ERANGE || fd < 0) {
            continue;
        }
        // Compare as unsigned: casting last to long would turn UINT_MAX into
        // -1 where long is 32 bits wide, and nothing would ever be closed.
        if ((unsigned long)fd < first || (unsigned long)fd > last) {
            continue;
        }
        // The directory stream's own descriptor is released by closedir().
        if (fd == dir_fd) {
            continue;
        }
        if (close((int)fd) < 0 && errno != EBADF && ret == 0) {
            saved_errno = errno;
            ret = -1;
        }
    }
    closedir(dir);

    if (ret < 0) {
        errno = saved_errno;
    }
    return ret;
}
/* Next close_range() in the lookup order, if any; resolved on first use. */
static int (*orig_close_range)(unsigned int, unsigned int, int) = NULL;

/*
 * Interposed close_range().
 *
 * The prototype uses "int flags" so that it matches the declaration newer
 * glibc versions provide in <unistd.h>; a mismatching parameter type would be
 * a conflicting-types compile error there.
 *
 * The underlying implementation is tried first when one can be resolved. Only
 * when it is missing, or fails with ENOSYS (not implemented) or EPERM (call
 * rejected, e.g. by a container's syscall filter), is the request served by
 * the user-space fallback, which closes the descriptors one by one and does
 * not interpret flags.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int close_range(unsigned int first, unsigned int last, int flags) {
    // Bind the underlying implementation lazily.
    if (!orig_close_range) {
        orig_close_range = dlsym(RTLD_NEXT, "close_range");
    }

    if (orig_close_range) {
        int ret = orig_close_range(first, last, flags);
        if (ret == 0 || (errno != ENOSYS && errno != EPERM)) {
            return ret;
        }
        // ENOSYS / EPERM: the call itself is unavailable, degrade below.
    }

    return optimized_fallback_close_range(first, last, (unsigned int)flags);
}
编译
gcc -fPIC -shared -o libpreload-semop.so wrap_semop.c
aarch64-linux-gnu-gcc -fPIC -shared -o libpreload-semop.so wrap_semop.c

320

被折叠的 条评论
为什么被折叠?



