Android系统启动流程(五)——init进程对子进程的监控(基于Android13)

init进程会读取rc文件,然后孵化很多其他系统服务进程,为防止子进程死亡后称为僵尸进程,init需要监测子进程是否死亡,如果死亡,则清除子进程资源,并重新拉起进程。
system/core/init/init.cpp

InstallSignalFdHandler(&epoll);

init进程第二阶段启动的时候,创建了epoll,并将子进程相关处理注册到epoll中

static void InstallSignalFdHandler(Epoll* epoll) {
    // Applying SA_NOCLDSTOP to a defaulted SIGCHLD handler prevents the signalfd from receiving
    // SIGCHLD when a child process stops or continues (b/77867680#comment9).
    const struct sigaction act { .sa_handler = SIG_DFL, .sa_flags = SA_NOCLDSTOP };
    sigaction(SIGCHLD, &act, nullptr);

    sigset_t mask;
    sigemptyset(&mask);
    sigaddset(&mask, SIGCHLD);

    if (!IsRebootCapable()) {
        // If init does not have the CAP_SYS_BOOT capability, it is running in a container.
        // In that case, receiving SIGTERM will cause the system to shut down.
        sigaddset(&mask, SIGTERM);
    }

    if (sigprocmask(SIG_BLOCK, &mask, nullptr) == -1) {
        PLOG(FATAL) << "failed to block signals";
    }

    // Register a handler to unblock signals in the child processes.
    const int result = pthread_atfork(nullptr, nullptr, &UnblockSignals);
    if (result != 0) {
        LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
    }

    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK);
    if (signal_fd == -1) {
        PLOG(FATAL) << "failed to create signalfd";
    }

    constexpr int flags = EPOLLIN | EPOLLPRI;
    auto handler = std::bind(HandleSignalFd, false);
    if (auto result = epoll->RegisterHandler(signal_fd, handler, flags); !result.ok()) {
        LOG(FATAL) << result.error();
    }
}

使用epoll注册监听信号量fd,子进程死亡会发送信号给init进程,这里就会调用epoll中注册的回调函数HandleSignalFd

static void HandleSignalFd(bool one_off) {
    signalfd_siginfo siginfo;
    auto started = std::chrono::steady_clock::now();
    do {
        ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
        if (bytes_read < 0 && errno == EAGAIN) {
            auto now = std::chrono::steady_clock::now();
            std::chrono::duration<double> waited = now - started;
            if (waited >= kDiagnosticTimeout) {
                LOG(ERROR) << "epoll() woke us up, but we waited with no SIGCHLD!";
                started = now;
            }

            std::this_thread::sleep_for(100ms);
            continue;
        }
        if (bytes_read != sizeof(siginfo)) {
            PLOG(ERROR) << "Failed to read siginfo from signal_fd";
            return;
        }
        break;
    } while (!one_off);

    switch (siginfo.ssi_signo) {
        case SIGCHLD:
            ReapAnyOutstandingChildren();
            break;
        case SIGTERM:
            HandleSigtermSignal(siginfo);
            break;
        default:
            PLOG(ERROR) << "signal_fd: received unexpected signal " << siginfo.ssi_signo;
            break;
    }
}

对于SIGCHLD信号,调用ReapAnyOutstandingChildren
system/core/init/sigchld_handler.cpp

void ReapAnyOutstandingChildren() {
      while (ReapOneProcess() != 0) {
      }
  }
static pid_t ReapOneProcess() {
    siginfo_t siginfo = {};
    // This returns a zombie pid or informs us that there are no zombies left to be reaped.
    // It does NOT reap the pid; that is done below.
    if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) {
        PLOG(ERROR) << "waitid failed";
        return 0;
    }

    auto pid = siginfo.si_pid;
    if (pid == 0) return 0;

    // At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
    // whenever the function returns from this point forward.
    // We do NOT want to reap the zombie earlier as in Service::Reap(), we kill(-pid, ...) and we
    // want the pid to remain valid throughout that (and potentially future) usages.
    auto reaper = make_scope_guard([pid] { TEMP_FAILURE_RETRY(waitpid(pid, nullptr, WNOHANG)); });

    std::string name;
    std::string wait_string;
    Service* service = nullptr;

    if (SubcontextChildReap(pid)) {
        name = "Subcontext";
    } else {
    	//查找死亡进程pid
        service = ServiceList::GetInstance().FindService(pid, &Service::pid);

        if (service) {
            name = StringPrintf("Service '%s' (pid %d)", service->name().c_str(), pid);
            if (service->flags() & SVC_EXEC) {
                auto exec_duration = boot_clock::now() - service->time_started();
                auto exec_duration_ms =
                    std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration).count();
                wait_string = StringPrintf(" waiting took %f seconds", exec_duration_ms / 1000.0f);
            } else if (service->flags() & SVC_ONESHOT) {
                auto exec_duration = boot_clock::now() - service->time_started();
                auto exec_duration_ms =
                        std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration)
                                .count();
                wait_string = StringPrintf(" oneshot service took %f seconds in background",
                                           exec_duration_ms / 1000.0f);
            }
        } else {
            name = StringPrintf("Untracked pid %d", pid);
        }
    }

    if (siginfo.si_code == CLD_EXITED) {
        LOG(INFO) << name << " exited with status " << siginfo.si_status << wait_string;
    } else {
        LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;
    }

    if (!service) {
        LOG(INFO) << name << " did not have an associated service entry and will not be reaped";
        return pid;
    }
	
	//杀死进程,清除资源,并设置重启标志
    service->Reap(siginfo);

    if (service->flags() & SVC_TEMPORARY) {
        ServiceList::GetInstance().RemoveService(*service);
    }

    return pid;
}

调用Reap函数清理子进程
system/core/init/service.cpp

void Service::Reap(const siginfo_t& siginfo) {
	//杀死进程
    if (!(flags_ & SVC_ONESHOT) || (flags_ & SVC_RESTART)) {
        KillProcessGroup(SIGKILL, false);
    } else {
        // Legacy behavior from ~2007 until Android R: this else branch did not exist and we did not
        // kill the process group in this case.
        if (SelinuxGetVendorAndroidVersion() >= __ANDROID_API_R__) {
            // The new behavior in Android R is to kill these process groups in all cases.  The
            // 'true' parameter instructions KillProcessGroup() to report a warning message where it
            // detects a difference in behavior has occurred.
            KillProcessGroup(SIGKILL, true);
        }
    }

    // Remove any socket resources we may have created.
    //清除socket节点
    for (const auto& socket : sockets_) {
        if (socket.persist) {
            continue;
        }
        auto path = ANDROID_SOCKET_DIR "/" + socket.name;
        unlink(path.c_str());
    }

    for (const auto& f : reap_callbacks_) {
        f(siginfo);
    }

    if ((siginfo.si_code != CLD_EXITED || siginfo.si_status != 0) && on_failure_reboot_target_) {
        LOG(ERROR) << "Service with 'reboot_on_failure' option failed, shutting down system.";
        trigger_shutdown(*on_failure_reboot_target_);
    }

    if (flags_ & SVC_EXEC) UnSetExec();

    if (name_ == "zygote" || name_ == "zygote64") {
    	//如果是zygote,则清除所有服务进程
        removeAllEmptyProcessGroups();
    }

    if (flags_ & SVC_TEMPORARY) return;

    pid_ = 0;
    flags_ &= (~SVC_RUNNING);
    start_order_ = 0;

    // Oneshot processes go into the disabled state on exit,
    // except when manually restarted.
    if ((flags_ & SVC_ONESHOT) && !(flags_ & SVC_RESTART) && !(flags_ & SVC_RESET)) {
        flags_ |= SVC_DISABLED;
    }

    // Disabled and reset processes do not get restarted automatically.
    if (flags_ & (SVC_DISABLED | SVC_RESET))  {
        NotifyStateChange("stopped");
        return;
    }

#if INIT_FULL_SOURCES
    static bool is_apex_updatable = android::sysprop::ApexProperties::updatable().value_or(false);
#else
    static bool is_apex_updatable = false;
#endif
    const bool is_process_updatable = !use_bootstrap_ns_ && is_apex_updatable;

    // If we crash > 4 times in 'fatal_crash_window_' minutes or before boot_completed,
    // reboot into bootloader or set crashing property
    boot_clock::time_point now = boot_clock::now();
    if (((flags_ & SVC_CRITICAL) || is_process_updatable) && !(flags_ & SVC_RESTART)) {
        bool boot_completed = GetBoolProperty("sys.boot_completed", false);
        if (now < time_crashed_ + fatal_crash_window_ || !boot_completed) {
            if (++crash_count_ > 4) {
                auto exit_reason = boot_completed ?
                    "in " + std::to_string(fatal_crash_window_.count()) + " minutes" :
                    "before boot completed";
                if (flags_ & SVC_CRITICAL) {
                    if (!GetBoolProperty("init.svc_debug.no_fatal." + name_, false)) {
                        // Aborts into `fatal_reboot_target_'.
                        SetFatalRebootTarget(fatal_reboot_target_);
                        LOG(FATAL) << "critical process '" << name_ << "' exited 4 times "
                                   << exit_reason;
                    }
                } else {
                    LOG(ERROR) << "process with updatable components '" << name_
                               << "' exited 4 times " << exit_reason;
                    // Notifies update_verifier and apexd
                    SetProperty("sys.init.updatable_crashing_process_name", name_);
                    SetProperty("sys.init.updatable_crashing", "1");
                }
            }
        } else {
            time_crashed_ = now;
            crash_count_ = 1;
        }
    }

    flags_ &= (~SVC_RESTART);
    //设置重启标志位
    flags_ |= SVC_RESTARTING;

    // Execute all onrestart commands for this service.
    //启动所有rc中标志位onrestart的服务
    onrestart_.ExecuteAllCommands();

    NotifyStateChange("restarting");
    return;
}

这里清除了进程的资源,并设置该进程标志为SVC_RESTARTING,并启动rc中定义的onrestart相关服务

while (true) {
        ...
        if (!IsShuttingDown()) {
            auto next_process_action_time = HandleProcessActions();

            // If there's a process that needs restarting, wake up in time for that.
            if (next_process_action_time) {
                epoll_timeout = std::chrono::ceil<std::chrono::milliseconds>(
                        *next_process_action_time - boot_clock::now());
                if (*epoll_timeout < 0ms) epoll_timeout = 0ms;
            }
        }
        ...
    }

init进程的死循环中HandleProcessActions会处理服务重启

static std::optional<boot_clock::time_point> HandleProcessActions() {
    std::optional<boot_clock::time_point> next_process_action_time;
    //遍历所有的service
    for (const auto& s : ServiceList::GetInstance()) {
        if ((s->flags() & SVC_RUNNING) && s->timeout_period()) {
            auto timeout_time = s->time_started() + *s->timeout_period();
            if (boot_clock::now() > timeout_time) {
                s->Timeout();
            } else {
                if (!next_process_action_time || timeout_time < *next_process_action_time) {
                    next_process_action_time = timeout_time;
                }
            }
        }
		
		//如果没有标志为SVC_RESTARTING,则跳过
        if (!(s->flags() & SVC_RESTARTING)) continue;

        auto restart_time = s->time_started() + s->restart_period();
        if (boot_clock::now() > restart_time) {
        	//调用Start重启服务
            if (auto result = s->Start(); !result.ok()) {
                LOG(ERROR) << "Could not restart process '" << s->name() << "': " << result.error();
            }
        } else {
            if (!next_process_action_time || restart_time < *next_process_action_time) {
                next_process_action_time = restart_time;
            }
        }
    }
    return next_process_action_time;
}

调用Start函数执行

Result<void> Service::Start() {
    ...

    pid_t pid = -1;
    if (namespaces_.flags) {
        pid = clone(nullptr, nullptr, namespaces_.flags | SIGCHLD, nullptr);
    } else {
    	//新建一个进程
        pid = fork();
    }

    if (pid == 0) {
    	//子进程中执行,运行子进程服务
        umask(077);
        RunService(override_mount_namespace, descriptors, std::move(pipefd));
        _exit(127);
    }

    if (pid < 0) {
        pid_ = 0;
        return ErrnoError() << "Failed to fork";
    }

    ...
}

fork子进程,并调用RunService执行service

void Service::RunService(const std::optional<MountNamespace>& override_mount_namespace,
                         const std::vector<Descriptor>& descriptors,
                         std::unique_ptr<std::array<int, 2>, decltype(&ClosePipe)> pipefd) {
    ...
    if (!ExpandArgsAndExecv(args_, sigstop_)) {
        PLOG(ERROR) << "cannot execv('" << args_[0]
                    << "'). See the 'Debugging init' section of init's README.md for tips";
    }
}

调用ExpandArgsAndExecv执行

static bool ExpandArgsAndExecv(const std::vector<std::string>& args, bool sigstop) {
    std::vector<std::string> expanded_args;
    std::vector<char*> c_strings;

    expanded_args.resize(args.size());
    c_strings.push_back(const_cast<char*>(args[0].data()));
    for (std::size_t i = 1; i < args.size(); ++i) {
        auto expanded_arg = ExpandProps(args[i]);
        if (!expanded_arg.ok()) {
            LOG(FATAL) << args[0] << ": cannot expand arguments': " << expanded_arg.error();
        }
        expanded_args[i] = *expanded_arg;
        c_strings.push_back(expanded_args[i].data());
    }
    c_strings.push_back(nullptr);

    if (sigstop) {
        kill(getpid(), SIGSTOP);
    }

    return execv(c_strings[0], c_strings.data()) == 0;
}

调用execv执行可执行文件

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值