linux 系统调用fork vfork clone
本文基于2.6内核源码,分析Linux创建进程/线程的三个系统调用fork、vfork、clone的内核实现;三者最终都通过do_fork完成,仅传入参数不同。
fork:通过fork创建新进程
vfork:主要用于马上执行exec的情况,因为马上就exec装入新的程序,所以可以去掉fork中分配新的地址空间等操作,进而加速进程创建
clone:功能更强,参数更多;主要用于创建线程/父子进程资源共享等,可以通过设置相应的参数实现fork、vfork的功能
I.系统调用
i.系统调用
arch/x86/kernel/syscall_table_32.S
1 ENTRY(sys_call_table)
4 .long ptregs_fork /* 3 */
122 .long ptregs_clone /* 120 */
192 .long ptregs_vfork /* 190 */
arch/x86/kernel/entry_32.S
709 /*
710 * System calls that need a pt_regs pointer.
711 */
712 #define PTREGSCALL(name) \
713 ALIGN; \
714 ptregs_##name: \
715 leal 4(%esp),%eax; \
716 jmp sys_##name;
717
718 PTREGSCALL(iopl)
719 PTREGSCALL(fork)
720 PTREGSCALL(clone)
721 PTREGSCALL(vfork)
arch/x86/kernel/process.c
217 int sys_fork(struct pt_regs *regs)
218 {
219 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
220 }
221
222 /*
223 * This is trivial, and on the face of it looks like it
224 * could equally well be done in user mode.
225 *
226 * Not so, for quite unobvious reasons - register pressure.
227 * In user mode vfork() cannot have a stack frame, and if
228 * done by calling the "clone()" system call directly, you
229 * do not have enough call-clobbered registers to hold all
230 * the information you need.
231 */
232 int sys_vfork(struct pt_regs *regs)
233 {
234 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
235 NULL, NULL);
236 }
arch/x86/kernel/process_32.c
432 int sys_clone(struct pt_regs *regs)
433 {
434 unsigned long clone_flags;
435 unsigned long newsp;
436 int __user *parent_tidptr, *child_tidptr;
437
438 clone_flags = regs->bx;
439 newsp = regs->cx;
440 parent_tidptr = (int __user *)regs->dx;
441 child_tidptr = (int __user *)regs->di;
442 if (!newsp)
443 newsp = regs->sp;
444 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
445 }
可以看出fork,vfork,clone都是通过do_fork实现,只是传入参数不同而已
ii.do_fork
kernel/fork.c
1363 * Ok, this is the main fork-routine.
1364 *
1365 * It copies the process, and if successful kick-starts
1366 * it and waits for it to finish using the VM if required.
1367 */
1368 long do_fork(unsigned long clone_flags,
1369 unsigned long stack_start,
1370 struct pt_regs *regs,
1371 unsigned long stack_size,
1372 int __user *parent_tidptr,
1373 int __user *child_tidptr)
1374 {
1375 struct task_struct *p;
1376 int trace = 0;
1377 long nr;
1378
1379 /*
1380 * Do some preliminary argument and permissions checking before we
1381 * actually start allocating stuff
1382 */
1383 if (clone_flags & CLONE_NEWUSER) {
1384 if (clone_flags & CLONE_THREAD)
1385 return -EINVAL;
1386 /* hopefully this check will go away when userns support is
1387 * complete
1388 */
1389 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1390 !capable(CAP_SETGID))
1391 return -EPERM;
1392 }
1393
1394 /*
1395 * We hope to recycle these flags after 2.6.26
1396 */
1397 if (unlikely(clone_flags & CLONE_STOPPED)) {
1398 static int __read_mostly count = 100;
1399
1400 if (count > 0 && printk_ratelimit()) {
1401 char comm[TASK_COMM_LEN];
1402
1403 count--;
1404 printk(KERN_INFO "fork(): process `%s' used deprecated "
1405 "clone flags 0x%lx\n",
1406 get_task_comm(comm, current),
1407 clone_flags & CLONE_STOPPED);
1408 }
1409 }
1410
1411 /*
1412 * When called from kernel_thread, don't do user tracing stuff.
1413 */
1414 if (likely(user_mode(regs)))
1415 trace = tracehook_prepare_clone(clone_flags);
1416
1417 p = copy_process(clone_flags, stack_start, regs, stack_size,
1418 child_tidptr, NULL, trace);
1419 /*
1420 * Do this prior waking up the new thread - the thread pointer
1421 * might get invalid after that point, if the thread exits quickly.
1422 */
1423 if (!IS_ERR(p)) {
1424 struct completion vfork;
1425
1426 trace_sched_process_fork(current, p);
1427
1428 nr = task_pid_vnr(p);
1429
1430 if (clone_flags & CLONE_PARENT_SETTID)
1431 put_user(nr, parent_tidptr);
1432
1433 if (clone_flags & CLONE_VFORK) {
1434 p->vfork_done = &vfork;
1435 init_completion(&vfork);
1436 }
1437
1438 audit_finish_fork(p);
1439 tracehook_report_clone(regs, clone_flags, nr, p);
1440
1441 /*
1442 * We set PF_STARTING at creation in case tracing wants to
1443 * use this to distinguish a fully live task from one that
1444 * hasn't gotten to tracehook_report_clone() yet. Now we
1445 * clear it and set the child going.
1446 */
1447 p->flags &= ~PF_STARTING;
1449 if (unlikely(clone_flags & CLONE_STOPPED)) {
1450 /*
1451 * We'll start up with an immediate SIGSTOP.
1452 */
1453 sigaddset(&p->pending.signal, SIGSTOP);
1454 set_tsk_thread_flag(p, TIF_SIGPENDING);
1455 __set_task_state(p, TASK_STOPPED);
1456 } else {
1457 wake_up_new_task(p, clone_flags);
1458 }
1459
1460 tracehook_report_clone_complete(trace, regs,
1461 clone_flags, nr, p);
1462
1463 if (clone_flags & CLONE_VFORK) {
1464 freezer_do_not_count();
1465 wait_for_completion(&vfork);
1466 freezer_count();
1467 tracehook_report_vfork_done(p, nr);
1468 }
1469 } else {
1470 nr = PTR_ERR(p);
1471 }
1472 return nr;
1473 }
1.输入参数及权限检查
2.复制进程
3.取子进程id
4.如果设置CLONE_PARENT_SETTID,则将子进程id放入parent_tidptr中
5.如果是vfork系统调用/带CLONE_VFORK标识的clone,初始化completion;用于暂停父进程在vfork/clone中,在子进程执行exec/exit后父进程再继续执行
6.唤醒子进程,在copy_thread中设置子进程的运行环境;子进程从ret_from_fork执行,内核堆栈与父进程刚进入fork/vfork/clone的内核堆栈相同(返回值ax设置成0,sp设置成新的用户堆栈),直接退出系统调用即可。
7.如果是vfork系统调用/带CLONE_VFORK标识的clone,等待子进程执行exec/exit
II.复制进程copy_process
kernel/fork.c
973 /*
974 * This creates a new process as a copy of the old one,
975 * but does not actually start it yet.
976 *
977 * It copies the registers, and all the appropriate
978 * parts of the process environment (as per the clone
979 * flags). The actual kick-off is left to the caller.
980 */
981 static struct task_struct *copy_process(unsigned long clone_flags,
982 unsigned long stack_start,
983 struct pt_regs *regs,
984 unsigned long stack_size,
985 int __user *child_tidptr,
986 struct pid *pid,
987 int trace)
988 {
989 int retval;
990 struct task_struct *p;
991 int cgroup_callbacks_done = 0;
992
993 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
994 return ERR_PTR(-EINVAL);
995
996 /*
997 * Thread groups must share signals as well, and detached threads
998 * can only be started up within the thread group.
999 */
1000 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1001 return ERR_PTR(-EINVAL);
1002
1003 /*
1004 * Shared signal handlers imply shared VM. By way of the above,
1005 * thread groups also imply shared VM. Blocking this case allows
1006 * for various simplifications in other code.
1007 */
1008 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
1009 return ERR_PTR(-EINVAL);
1010
1011 /*
1012 * Siblings of global init remain as zombies on exit since they are
1013 * not reaped by their parent (swapper). To solve this and to avoid
1014 * multi-rooted process trees, prevent global and container-inits
1015 * from creating siblings.
1016 */
1017 if ((clone_flags & CLONE_PARENT) &&
1018 current->signal->flags & SIGNAL_UNKILLABLE)
1019 return ERR_PTR(-EINVAL);
1020
1021 retval = security_task_create(clone_flags);
1022 if (retval)
1023 goto fork_out;
1024
1025 retval = -ENOMEM;
1026 p = dup_task_struct(current);
1027 if (!p)
1028 goto fork_out;
1029
1030 ftrace_graph_init_task(p);
1031
1032 rt_mutex_init_task(p);
1033
1034 #ifdef CONFIG_PROVE_LOCKING
1035 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
1036 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
1037 #endif
1038 retval = -EAGAIN;
1039 if (atomic_read(&p->real_cred->user->processes) >=
1040 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
1041 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1042 p->real_cred->user != INIT_USER)
1043 goto bad_fork_free;
1044 }
1045
1046 retval = copy_creds(p, clone_flags);
1047 if (retval < 0)
1048 goto bad_fork_free;
1049
1050 /*
1051 * If multiple threads are within copy_process(), then this check
1052 * triggers too late. This doesn't hurt, the check is only there
1053 * to stop root fork bombs.
1054 */
1055 retval = -EAGAIN;
1056 if (nr_threads >= max_threads)
1057 goto bad_fork_cleanup_count;
1058
1059 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1060 goto bad_fork_cleanup_count;
1061
1062 p->did_exec = 0;
1063 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1064 copy_flags(clone_flags, p);
1065 INIT_LIST_HEAD(&p->children);
1066 INIT_LIST_HEAD(&p->sibling);
1067 rcu_copy_process(p);
1068 p->vfork_done = NULL;
1069 spin_lock_init(&p->alloc_lock);
1070
1071 init_sigpending(&p->pending);
1072
1073 p->utime = cputime_zero;
1074 p->stime = cputime_zero;
1075 p->gtime = cputime_zero;
1076 p->utimescaled = cputime_zero;
1077 p->stimescaled = cputime_zero;
1078 p->prev_utime = cputime_zero;
1079 p->prev_stime = cputime_zero;
1080
1081 p->default_timer_slack_ns = current->timer_slack_ns;
1082
1083 task_io_accounting_init(&p->ioac);
1084 acct_clear_integrals(p);
1085
1086 posix_cpu_timers_init(p);
1087
1088 p->lock_depth = -1; /* -1 = no lock */
1089 do_posix_clock_monotonic_gettime(&p->start_time);
1090 p->real_start_time = p->start_time;
1091 monotonic_to_bootbased(&p->real_start_time);
1092 p->io_context = NULL;
1093 p->audit_context = NULL;
1094 cgroup_fork(p);
1095 #ifdef CONFIG_NUMA
1096 p->mempolicy = mpol_dup(p->mempolicy);
1097 if (IS_ERR(p->mempolicy)) {
1098 retval = PTR_ERR(p->mempolicy);
1099 p->mempolicy = NULL;
1100 goto bad_fork_cleanup_cgroup;
1101 }
1102 mpol_fix_fork_child_flag(p);
1103 #endif
1104 #ifdef CONFIG_TRACE_IRQFLAGS
1105 p->irq_events = 0;
1106 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1107 p->hardirqs_enabled = 1;
1108 #else
1109 p->hardirqs_enabled = 0;
1110 #endif
1111 p->hardirq_enable_ip = 0;
1112 p->hardirq_enable_event = 0;
1113 p->hardirq_disable_ip = _THIS_IP_;
1114 p->hardirq_disable_event = 0;
1115 p->softirqs_enabled = 1;
1116 p->softirq_enable_ip = _THIS_IP_;
1117 p->softirq_enable_event = 0;
1118 p->softirq_disable_ip = 0;
1119 p->softirq_disable_event = 0;
1120 p->hardirq_context = 0;
1121 p->softirq_context = 0;
1122 #endif
1123 #ifdef CONFIG_LOCKDEP
1124 p->lockdep_depth = 0; /* no locks held yet */
1125 p->curr_chain_key = 0;
1126 p->lockdep_recursion = 0;
1127 #endif
1128
1129 #ifdef CONFIG_DEBUG_MUTEXES
1130 p->blocked_on = NULL; /* not blocked yet */
1131 #endif
1132
1133 p->bts = NULL;
1134
1135 /* Perform scheduler related setup. Assign this task to a CPU. */
1136 sched_fork(p, clone_flags);
1137
1138 retval = perf_event_init_task(p);
1139 if (retval)
1140 goto bad_fork_cleanup_policy;
1141
1142 if ((retval = audit_alloc(p)))
1143 goto bad_fork_cleanup_policy;
1144 /* copy all the process information */
1145 if ((retval = copy_semundo(clone_flags, p)))
1146 goto bad_fork_cleanup_audit;
1147 if ((retval = copy_files(clone_flags, p)))
1148 goto bad_fork_cleanup_semundo;
1149 if ((retval = copy_fs(clone_flags, p)))
1150 goto bad_fork_cleanup_files;
1151 if ((retval = copy_sighand(clone_flags, p)))
1152 goto bad_fork_cleanup_fs;
1153 if ((retval = copy_signal(clone_flags, p)))
1154 goto bad_fork_cleanup_sighand;
1155 if ((retval = copy_mm(clone_flags, p)))
1156 goto bad_fork_cleanup_signal;
1157 if ((retval = copy_namespaces(clone_flags, p)))
1158 goto bad_fork_cleanup_mm;
1159 if ((retval = copy_io(clone_flags, p)))
1160 goto bad_fork_cleanup_namespaces;
1161 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1162 if (retval)
1163 goto bad_fork_cleanup_io;
1164
1165 if (pid != &init_struct_pid) {
1166 retval = -ENOMEM;
1167 pid = alloc_pid(p->nsproxy->pid_ns);
1168 if (!pid)
1169 goto bad_fork_cleanup_io;
1170
1171 if (clone_flags & CLONE_NEWPID) {
1172 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1173 if (retval < 0)
1174 goto bad_fork_free_pid;
1175 }
1176 }
1177
1178 p->pid = pid_nr(pid);
1179 p->tgid = p->pid;
1180 if (clone_flags & CLONE_THREAD)
1181 p->tgid = current->tgid;
1182
1183 if (current->nsproxy != p->nsproxy) {
1184 retval = ns_cgroup_clone(p, pid);
1185 if (retval)
1186 goto bad_fork_free_pid;
1187 }
1188
1189 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1190 /*
1191 * Clear TID on mm_release()?
1192 */
1193 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1194 #ifdef CONFIG_FUTEX
1195 p->robust_list = NULL;
1196 #ifdef CONFIG_COMPAT
1197 p->compat_robust_list = NULL;
1198 #endif
1199 INIT_LIST_HEAD(&p->pi_state_list);
1200 p->pi_state_cache = NULL;
1201 #endif
1202 /*
1203 * sigaltstack should be cleared when sharing the same VM
1204 */
1205 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
1206 p->sas_ss_sp = p->sas_ss_size = 0;
1207
1208 /*
1209 * Syscall tracing should be turned off in the child regardless
1210 * of CLONE_PTRACE.
1211 */
1212 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1213 #ifdef TIF_SYSCALL_EMU
1214 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1215 #endif
1216 clear_all_latency_tracing(p);
1217
1218 /* ok, now we should be set up.. */
1219 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1220 p->pdeath_signal = 0;
1221 p->exit_state = 0;
1222
1223 /*
1224 * Ok, make it visible to the rest of the system.
1225 * We dont wake it up yet.
1226 */
1227 p->group_leader = p;
1228 INIT_LIST_HEAD(&p->thread_group);
1229
1230 /* Now that the task is set up, run cgroup callbacks if
1231 * necessary. We need to run them before the task is visible
1232 * on the tasklist. */
1233 cgroup_fork_callbacks(p);
1234 cgroup_callbacks_done = 1;
1235
1236 /* Need tasklist lock for parent etc handling! */
1237 write_lock_irq(&tasklist_lock);
1238
1239 /* CLONE_PARENT re-uses the old parent */
1240 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1241 p->real_parent = current->real_parent;
1242 p->parent_exec_id = current->parent_exec_id;
1243 } else {
1244 p->real_parent = current;
1245 p->parent_exec_id = current->self_exec_id;
1246 }
1247
1248 spin_lock(&current->sighand->siglock);
1249
1250 /*
1251 * Process group and session signals need to be delivered to just the
1252 * parent before the fork or both the parent and the child after the
1253 * fork. Restart if a signal comes in before we add the new process to
1254 * it's process group.
1255 * A fatal signal pending means that current will exit, so the new
1256 * thread can't slip out of an OOM kill (or normal SIGKILL).
1257 */
1258 recalc_sigpending();
1259 if (signal_pending(current)) {
1260 spin_unlock(&current->sighand->siglock);
1261 write_unlock_irq(&tasklist_lock);
1262 retval = -ERESTARTNOINTR;
1263 goto bad_fork_free_pid;
1264 }
1265
1266 if (clone_flags & CLONE_THREAD) {
1267 atomic_inc(&current->signal->count);
1268 atomic_inc(&current->signal->live);
1269 p->group_leader = current->group_leader;
1270 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1271 }
1272
1273 if (likely(p->pid)) {
1274 list_add_tail(&p->sibling, &p->real_parent->children);
1275 tracehook_finish_clone(p, clone_flags, trace);
1276
1277 if (thread_group_leader(p)) {
1278 if (clone_flags & CLONE_NEWPID)
1279 p->nsproxy->pid_ns->child_reaper = p;
1280
1281 p->signal->leader_pid = pid;
1282 tty_kref_put(p->signal->tty);
1283 p->signal->tty = tty_kref_get(current->signal->tty);
1284 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1285 attach_pid(p, PIDTYPE_SID, task_session(current));
1286 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1287 __get_cpu_var(process_counts)++;
1288 }
1289 attach_pid(p, PIDTYPE_PID, pid);
1290 nr_threads++;
1291 }
1292
1293 total_forks++;
1294 spin_unlock(&current->sighand->siglock);
1295 write_unlock_irq(&tasklist_lock);
1296 proc_fork_connector(p);
1297 cgroup_post_fork(p);
1298 perf_event_fork(p);
1299 return p;
1300
1301 bad_fork_free_pid:
1302 if (pid != &init_struct_pid)
1303 free_pid(pid);
1304 bad_fork_cleanup_io:
1305 if (p->io_context)
1306 exit_io_context(p);
1307 bad_fork_cleanup_namespaces:
1308 exit_task_namespaces(p);
1309 bad_fork_cleanup_mm:
1310 if (p->mm)
1311 mmput(p->mm);
1312 bad_fork_cleanup_signal:
1313 if (!(clone_flags & CLONE_THREAD))
1314 __cleanup_signal(p->signal);
1315 bad_fork_cleanup_sighand:
1316 __cleanup_sighand(p->sighand);
1317 bad_fork_cleanup_fs:
1318 exit_fs(p); /* blocking */
1319 bad_fork_cleanup_files:
1320 exit_files(p); /* blocking */
1321 bad_fork_cleanup_semundo:
1322 exit_sem(p);
1323 bad_fork_cleanup_audit:
1324 audit_free(p);
1325 bad_fork_cleanup_policy:
1326 perf_event_free_task(p);
1327 #ifdef CONFIG_NUMA
1328 mpol_put(p->mempolicy);
1329 bad_fork_cleanup_cgroup:
1330 #endif
1331 cgroup_exit(p, cgroup_callbacks_done);
1332 delayacct_tsk_free(p);
1333 module_put(task_thread_info(p)->exec_domain->module);
1334 bad_fork_cleanup_count:
1335 atomic_dec(&p->cred->user->processes);
1336 exit_creds(p);
1337 bad_fork_free:
1338 free_task(p);
1339 fork_out:
1340 return ERR_PTR(retval);
1341 }
i.clone_flags共存标识检查
ii.复制进程描述符dup_task_struct
iii.检查用户当前进程数未超限
iv.复制凭证copy_creds
v.检查当前系统范围内进程数未超限;max_threads在fork_init中初始化,=可用内存/内核堆栈大小/8(每个进程都有自己的内核堆栈,每个堆栈大小是4k或8k),即内核堆栈总占用不能超过可用内存的1/8
vi.初始化子进程进程描述符,即与父进程不同的地方复制成新值;清空子进程链表、清空pending的信号、设置时间等
vii.初始化进程调度信息sched_fork
viii.复制信号量撤销链表copy_semundo
ix.复制打开文件信息copy_files
x.复制文件系统信息copy_fs
xi.复制信号处理描述符copy_sighand
xii.复制信号描述符copy_signal
xiii.复制地址空间信息copy_mm
xiv.复制命名空间copy_namespaces
xv.复制子进程运行环境copy_thread
xvi.分配子进程id,并设置相应的进程号pid、线程组长号tgid、线程组长group_leader、父进程real_parent
xvii.将进程链入父进程的子进程链表、进程组成员链表、会话组成员链表、进程链表
ii.dup_task_struct
kernel/fork.c
222 static struct task_struct *dup_task_struct(struct task_struct *orig)
223 {
224 struct task_struct *tsk;
225 struct thread_info *ti;
226 unsigned long *stackend;
227
228 int err;
229
230 prepare_to_copy(orig);
231
232 tsk = alloc_task_struct();
233 if (!tsk)
234 return NULL;
235
236 ti = alloc_thread_info(tsk);
237 if (!ti) {
238 free_task_struct(tsk);
239 return NULL;
240 }
241
242 err = arch_dup_task_struct(tsk, orig);
243 if (err)
244 goto out;
245
246 tsk->stack = ti;
247
248 err = prop_local_init_single(&tsk->dirties);
249 if (err)
250 goto out;
251
252 setup_thread_stack(tsk, orig);
253 stackend = end_of_stack(tsk);
254 *stackend = STACK_END_MAGIC; /* for overflow detection */
255
256 #ifdef CONFIG_CC_STACKPROTECTOR
257 tsk->stack_canary = get_random_int();
258 #endif
259
260 /* One for us, one for whoever does the "release_task()" (usually parent) */
261 atomic_set(&tsk->usage,2);
262 atomic_set(&tsk->fs_excl, 0);
263 #ifdef CONFIG_BLK_DEV_IO_TRACE
264 tsk->btrace_seq = 0;
265 #endif
266 tsk->splice_pipe = NULL;
267
268 account_kernel_stack(ti, 1);
269
270 return tsk;
271
272 out:
273 free_thread_info(ti);
274 free_task_struct(tsk);
275 return NULL;
276 }
1.分配进程描述符
2.分配thread_info及内核堆栈
3.复制父进程描述符信息
4.进程描述符与内核堆栈关联起来
5.复制thread_info信息
6.内核堆栈末端设置幻数STACK_END_MAGIC,用于检测堆栈越界(溢出检测)
7.设置进程描述符引用计数
iv.copy_creds
kernel/cred.c
427 /*
428 * Copy credentials for the new process created by fork()
429 *
430 * We share if we can, but under some circumstances we have to generate a new
431 * set.
432 *
433 * The new process gets the current process's subjective credentials as its
434 * objective and subjective credentials
435 */
436 int copy_creds(struct task_struct *p, unsigned long clone_flags)
437 {
438 #ifdef CONFIG_KEYS
439 struct thread_group_cred *tgcred;
440 #endif
441 struct cred *new;
442 int ret;
443
444 mutex_init(&p->cred_guard_mutex);
445
446 p->replacement_session_keyring = NULL;
447
448 if (
449 #ifdef CONFIG_KEYS
450 !p->cred->thread_keyring &&
451 #endif
452 clone_flags & CLONE_THREAD
453 ) {
454 p->real_cred = get_cred(p->cred);
455 get_cred(p->cred);
456 alter_cred_subscribers(p->cred, 2);
457 kdebug("share_creds(%p{%d,%d})",
458 p->cred, atomic_read(&p->cred->usage),
459 read_cred_subscribers(p->cred));
460 atomic_inc(&p->cred->user->processes);
461 return 0;
462 }
463
464 new = prepare_creds();
465 if (!new)
466 return -ENOMEM;
467
468 if (clone_flags & CLONE_NEWUSER) {
469 ret = create_user_ns(new);
470 if (ret < 0)
471 goto error_put;
472 }
473
474 #ifdef CONFIG_KEYS
475 /* new threads get their own thread keyrings if their parent already
476 * had one */
477 if (new->thread_keyring) {
478 key_put(new->thread_keyring);
479 new->thread_keyring = NULL;
480 if (clone_flags & CLONE_THREAD)
481 install_thread_keyring_to_cred(new);
482 }
483
484 /* we share the process and session keyrings between all the threads in
485 * a process - this is slightly icky as we violate COW credentials a
486 * bit */
487 if (!(clone_flags & CLONE_THREAD)) {
488 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
489 if (!tgcred) {
490 ret = -ENOMEM;
491 goto error_put;
492 }
493 atomic_set(&tgcred->usage, 1);
494 spin_lock_init(&tgcred->lock);
495 tgcred->process_keyring = NULL;
496 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
497
498 release_tgcred(new);
499 new->tgcred = tgcred;
500 }
501 #endif
502
503 atomic_inc(&new->user->processes);
504 p->cred = p->real_cred = get_cred(new);
505 alter_cred_subscribers(new, 2);
506 validate_creds(new);
507 return 0;
508
509 error_put:
510 put_cred(new);
511 return ret;
512 }
子进程的主/客体凭证复制自父进程的主体凭证
1.如果是创建线程
父进程的主体凭证引用计数器加1,将子进程的主/客体凭证指向父进程的主体凭证即可
2.如果是创建进程
a.分配凭证描述符并初始化,复制父进程主体凭证、引用计数器置1等
b.子进程主/客体凭证指向新的凭证描述符
用户进程数计数器加1
viii.copy_semundo
/* ipc/sem.c */
1244 /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
1245 * parent and child tasks.
1246 */
1247
1248 int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
1249 {
1250 struct sem_undo_list *undo_list;
1251 int error;
1252
1253 if (clone_flags & CLONE_SYSVSEM) {
1254 error = get_undo_list(&undo_list);
1255 if (error)
1256 return error;
1257 atomic_inc(&undo_list->refcnt);
1258 tsk->sysvsem.undo_list = undo_list;
1259 } else
1260 tsk->sysvsem.undo_list = NULL;
1261
1262 return 0;
1263 }
1.如果与父进程共享信号量,将子进程的信号量撤销链表指向父进程的撤销链表
2.如果不与父进程共享信号量,清空子进程的信号量撤销链表
ix.copy_files
/* kernel/fork.c */
748 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
749 {
750 struct files_struct *oldf, *newf;
751 int error = 0;
752
753 /*
754 * A background process may not have any files ...
755 */
756 oldf = current->files;
757 if (!oldf)
758 goto out;
759
760 if (clone_flags & CLONE_FILES) {
761 atomic_inc(&oldf->count);
762 goto out;
763 }
764
765 newf = dup_fd(oldf, &error);
766 if (!newf)
767 goto out;
768
769 tsk->files = newf;
770 error = 0;
771 out:
772 return error;
773 }
/* fs/file.c */
289 /*
290 * Allocate a new files structure and copy contents from the
291 * passed in files structure.
292 * errorp will be valid only when the returned files_struct is NULL.
293 */
294 struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
295 {
296 struct files_struct *newf;
297 struct file **old_fds, **new_fds;
298 int open_files, size, i;
299 struct fdtable *old_fdt, *new_fdt;
300
301 *errorp = -ENOMEM;
302 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
303 if (!newf)
304 goto out;
305
306 atomic_set(&newf->count, 1);
307
308 spin_lock_init(&newf->file_lock);
309 newf->next_fd = 0;
310 new_fdt = &newf->fdtab;
311 new_fdt->max_fds = NR_OPEN_DEFAULT;
312 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
313 new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
314 new_fdt->fd = &newf->fd_array[0];
315 INIT_RCU_HEAD(&new_fdt->rcu);
316 new_fdt->next = NULL;
317
318 spin_lock(&oldf->file_lock);
319 old_fdt = files_fdtable(oldf);
320 open_files = count_open_files(old_fdt);
321
322 /*
323 * Check whether we need to allocate a larger fd array and fd set.
324 */
325 while (unlikely(open_files > new_fdt->max_fds)) {
326 spin_unlock(&oldf->file_lock);
327
328 if (new_fdt != &newf->fdtab) {
329 free_fdarr(new_fdt);
330 free_fdset(new_fdt);
331 kfree(new_fdt);
332 }
333
334 new_fdt = alloc_fdtable(open_files - 1);
335 if (!new_fdt) {
336 *errorp = -ENOMEM;
337 goto out_release;
338 }
339
340 /* beyond sysctl_nr_open; nothing to do */
341 if (unlikely(new_fdt->max_fds < open_files)) {
342 free_fdarr(new_fdt);
343 free_fdset(new_fdt);
344 kfree(new_fdt);
345 *errorp = -EMFILE;
346 goto out_release;
347 }
348
349 /*
350 * Reacquire the oldf lock and a pointer to its fd table
351 * who knows it may have a new bigger fd table. We need
352 * the latest pointer.
353 */
354 spin_lock(&oldf->file_lock);
355 old_fdt = files_fdtable(oldf);
356 open_files = count_open_files(old_fdt);
357 }
358
359 old_fds = old_fdt->fd;
360 new_fds = new_fdt->fd;
361
362 memcpy(new_fdt->open_fds->fds_bits,
363 old_fdt->open_fds->fds_bits, open_files/8);
364 memcpy(new_fdt->close_on_exec->fds_bits,
365 old_fdt->close_on_exec->fds_bits, open_files/8);
366
367 for (i = open_files; i != 0; i--) {
368 struct file *f = *old_fds++;
369 if (f) {
370 get_file(f);
371 } else {
372 /*
373 * The fd may be claimed in the fd bitmap but not yet
374 * instantiated in the files array if a sibling thread
375 * is partway through open(). So make sure that this
376 * fd is available to the new process.
377 */
378 FD_CLR(open_files - i, new_fdt->open_fds);
379 }
380 rcu_assign_pointer(*new_fds++, f);
381 }
382 spin_unlock(&oldf->file_lock);
383
384 /* compute the remainder to be cleared */
385 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
386
387 /* This is long word aligned thus could use a optimized version */
388 memset(new_fds, 0, size);
389
390 if (new_fdt->max_fds > open_files) {
391 int left = (new_fdt->max_fds-open_files)/8;
392 int start = open_files / (8 * sizeof(unsigned long));
393
394 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
395 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
396 }
397
398 rcu_assign_pointer(newf->fdt, new_fdt);
399
400 return newf;
401
402 out_release:
403 kmem_cache_free(files_cachep, newf);
404 out:
405 return NULL;
406 }
1.如果共享打开文件,将父进程打开文件描述符引用计数器加1
2.如果不共享,复制父进程打开文件描述符;
A.分配打开文件描述符
B.初始化打开文件描述符
a.打开文件描述符引用计数器置1
b.将打开文件描述符表指向预分配的文件描述符表,并初始化描述符表的打开文件位图、exec关闭文件位图、file数组指向预分配结构
C.如果预分配的文件描述符表小于父进程已经打开的文件,则重新分配打开文件描述符表
D.复制父进程打开文件位图、exec关闭文件位图信息
E.将父进程打开的文件引用计数器加1,并添加到子进程打开文件描述符表中
F.将文件描述符表中多出的file数组、打开文件位图、exec关闭文件位图清空
G.将打开文件描述符中的fdt指向文件描述符表
下图表示files_struct与file之间的关系:
x.copy_fs
/* kernel/fork.c */
728 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
729 {
730 struct fs_struct *fs = current->fs;
731 if (clone_flags & CLONE_FS) {
732 /* tsk->fs is already what we want */
733 write_lock(&fs->lock);
734 if (fs->in_exec) {
735 write_unlock(&fs->lock);
736 return -EAGAIN;
737 }
738 fs->users++;
739 write_unlock(&fs->lock);
740 return 0;
741 }
742 tsk->fs = copy_fs_struct(fs);
743 if (!tsk->fs)
744 return -ENOMEM;
745 return 0;
746 }
1.如果CLONE_FS,文件系统描述符引用计数器加1
2.复制文件系统描述符,包含当前目录及根目录信息
xi.copy_sighand
/* kernel/fork.c */
800 static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
801 {
802 struct sighand_struct *sig;
803
804 if (clone_flags & CLONE_SIGHAND) {
805 atomic_inc(&current->sighand->count);
806 return 0;
807 }
808 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
809 rcu_assign_pointer(tsk->sighand, sig);
810 if (!sig)
811 return -ENOMEM;
812 atomic_set(&sig->count, 1);
813 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
814 return 0;
815 }
1.如果共享信号处理,父进程信号处理描述符引用计数器加1即可
2.分配信号处理描述符,复制父进程信号处理描述符表;新的信号处理描述符表引用计数器置1,并与子进程关联
xii.copy_signal
/* kernel/fork.c */
857 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
858 {
859 struct signal_struct *sig;
860
861 if (clone_flags & CLONE_THREAD)
862 return 0;
863
864 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
865 tsk->signal = sig;
866 if (!sig)
867 return -ENOMEM;
868
869 atomic_set(&sig->count, 1);
870 atomic_set(&sig->live, 1);
871 init_waitqueue_head(&sig->wait_chldexit);
872 sig->flags = 0;
873 if (clone_flags & CLONE_NEWPID)
874 sig->flags |= SIGNAL_UNKILLABLE;
875 sig->group_exit_code = 0;
876 sig->group_exit_task = NULL;
877 sig->group_stop_count = 0;
878 sig->curr_target = tsk;
879 init_sigpending(&sig->shared_pending);
880 INIT_LIST_HEAD(&sig->posix_timers);
881
882 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
883 sig->it_real_incr.tv64 = 0;
884 sig->real_timer.function = it_real_fn;
885
886 sig->leader = 0; /* session leadership doesn't inherit */
887 sig->tty_old_pgrp = NULL;
888 sig->tty = NULL;
889
890 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
891 sig->gtime = cputime_zero;
892 sig->cgtime = cputime_zero;
893 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
894 sig->prev_utime = sig->prev_stime = cputime_zero;
895 #endif
896 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
897 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
898 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
899 sig->maxrss = sig->cmaxrss = 0;
900 task_io_accounting_init(&sig->ioac);
901 sig->sum_sched_runtime = 0;
902 taskstats_tgid_init(sig);
903
904 task_lock(current->group_leader);
905 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
906 task_unlock(current->group_leader);
907
908 posix_cpu_timers_init_group(sig);
909
910 acct_init_pacct(&sig->pacct);
911
912 tty_audit_fork(sig);
913
914 sig->oom_adj = current->signal->oom_adj;
915
916 return 0;
917 }
1.如果创建线程,跳过
2.分配信号描述符
3.初始化信号描述符,引用计数器置1、清空共享信号pending
xiii.copy_mm
/* kernel/fork.c */
681 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
682 {
683 struct mm_struct * mm, *oldmm;
684 int retval;
685
686 tsk->min_flt = tsk->maj_flt = 0;
687 tsk->nvcsw = tsk->nivcsw = 0;
688 #ifdef CONFIG_DETECT_HUNG_TASK
689 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
690 #endif
691
692 tsk->mm = NULL;
693 tsk->active_mm = NULL;
694
695 /*
696 * Are we cloning a kernel thread?
697 *
698 * We need to steal a active VM for that..
699 */
700 oldmm = current->mm;
701 if (!oldmm)
702 return 0;
703
704 if (clone_flags & CLONE_VM) {
705 atomic_inc(&oldmm->mm_users);
706 mm = oldmm;
707 goto good_mm;
708 }
709
710 retval = -ENOMEM;
711 mm = dup_mm(tsk);
712 if (!mm)
713 goto fail_nomem;
714
715 good_mm:
716 /* Initializing for Swap token stuff */
717 mm->token_priority = 0;
718 mm->last_interval = 0;
719
720 tsk->mm = mm;
721 tsk->active_mm = mm;
722 return 0;
723
724 fail_nomem:
725 return retval;
726 }
1.分配地址空间描述符mm_struct
2.复制父进程地址空间描述符信息
3.初始化子进程地址空间描述符,覆盖从父进程地址空间描述符那里复制的部分;如引用数、map读写信号等
4.可执行文件引用计数器加1,以便在父进程退出后可执行文件也不会关闭
5.复制地址空间的映射区
遍历地址空间段
A.分配地址空间段描述符
B.复制地址空间段描述符信息
C.初始化地址空间段描述符,如将描述符从父进程链表&红黑树中删除、指向子进程的地址空间描述符等
D.如果是文件映射;文件引用计数器加1等
E.将地址空间段描述符添加到描述符链表及描述符红黑树中
F.映射区计数加1
G.复制页表copy_page_range
修改可写页表项,添加页保护标志,当往该页写数据时产生缺页异常,缺页异常处理会为进程分配新的页帧,这就是COW(copy on write)技术,这样做即加速了fork过程并且又节省了内存;而只读页帧会在进程间共享,如代码段
dup_mm->dup_mmap->copy_page_range->copy_pud_range->copy_pmd_range->copy_pte_range->copy_one_pte->ptep_set_wrprotect&pte_wrprotect
H.回调地址空间的open函数
xiv.copy_namespaces
/* kernel/nsproxy.c */
103 /*
104 * called from clone. This now handles copy for nsproxy and all
105 * namespaces therein.
106 */
107 int copy_namespaces(unsigned long flags, struct task_struct *tsk)
108 {
109 struct nsproxy *old_ns = tsk->nsproxy;
110 struct nsproxy *new_ns;
111 int err = 0;
112
113 if (!old_ns)
114 return 0;
115
116 get_nsproxy(old_ns);
117
118 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
119 CLONE_NEWPID | CLONE_NEWNET)))
120 return 0;
121
122 if (!capable(CAP_SYS_ADMIN)) {
123 err = -EPERM;
124 goto out;
125 }
126
127 /*
128 * CLONE_NEWIPC must detach from the undolist: after switching
129 * to a new ipc namespace, the semaphore arrays from the old
130 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
131 * means share undolist with parent, so we must forbid using
132 * it along with CLONE_NEWIPC.
133 */
134 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
135 err = -EINVAL;
136 goto out;
137 }
138
139 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
140 if (IS_ERR(new_ns)) {
141 err = PTR_ERR(new_ns);
142 goto out;
143 }
144
145 tsk->nsproxy = new_ns;
146
147 out:
148 put_nsproxy(old_ns);
149 return err;
150 }
1.如果clone_flags没有使用新命名空间标识,直接返回
2.根据clone_flags中使用新命名空间标识,创建新的命名空间;修改子进程的命名空间nsproxy为新命名空间
xv.复制子进程运行环境copy_thread
/* arch/x86/kernel/process_32.c */
242 int copy_thread(unsigned long clone_flags, unsigned long sp,
243 unsigned long unused,
244 struct task_struct *p, struct pt_regs *regs)
245 {
246 struct pt_regs *childregs;
247 struct task_struct *tsk;
248 int err;
249
250 childregs = task_pt_regs(p);
251 *childregs = *regs;
252 childregs->ax = 0;
253 childregs->sp = sp;
254
255 p->thread.sp = (unsigned long) childregs;
256 p->thread.sp0 = (unsigned long) (childregs+1);
257
258 p->thread.ip = (unsigned long) ret_from_fork;
259
260 task_user_gs(p) = get_user_gs(regs);
261
262 tsk = current;
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL);
266 if (!p->thread.io_bitmap_ptr) {
267 p->thread.io_bitmap_max = 0;
268 return -ENOMEM;
269 }
270 set_tsk_thread_flag(p, TIF_IO_BITMAP);
271 }
272
273 err = 0;
274
275 /*
276 * Set a new TLS for the child thread?
277 */
278 if (clone_flags & CLONE_SETTLS)
279 err = do_set_thread_area(p, -1,
280 (struct user_desc __user *)childregs->si, 0);
281
282 if (err && p->thread.io_bitmap_ptr) {
283 kfree(p->thread.io_bitmap_ptr);
284 p->thread.io_bitmap_max = 0;
285 }
286
287 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
288 p->thread.ds_ctx = NULL;
289
290 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
291 p->thread.debugctlmsr = 0;
292
293 return err;
294 }
1.将父进程内核堆栈中保存的寄存器值,复制到子进程的内核堆栈中;以便子进程在fork返回时与原进程状态一致
2.设置子进程fork/vfork/clone返回值为0(系统调用返回值放在ax中)
3.设置子进程用户空间的堆栈;fork/vfork的子进程堆栈指针与父进程相同:fork的父子进程使用各自的地址空间(写时复制),不会冲突;vfork虽与父进程共享地址空间,但父进程会被挂起直到子进程执行exec/exit,同样不会冲突;
4.设置子进程的内核堆栈,因为子进程直接从fork返回,所以将栈顶设置成保存寄存器的起始位置
5.设置子进程被调用后从ret_from_fork开始执行
dup_task_struct及copy_thread之后进程描述符及内核堆栈,如下图所示:
xvi.alloc_pid
/* kernel/pid.c */
245 struct pid *alloc_pid(struct pid_namespace *ns)
246 {
247 struct pid *pid;
248 enum pid_type type;
249 int i, nr;
250 struct pid_namespace *tmp;
251 struct upid *upid;
252
253 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
254 if (!pid)
255 goto out;
256
257 tmp = ns;
258 for (i = ns->level; i >= 0; i--) {
259 nr = alloc_pidmap(tmp);
260 if (nr < 0)
261 goto out_free;
262
263 pid->numbers[i].nr = nr;
264 pid->numbers[i].ns = tmp;
265 tmp = tmp->parent;
266 }
267
268 get_pid_ns(ns);
269 pid->level = ns->level;
270 atomic_set(&pid->count, 1);
271 for (type = 0; type < PIDTYPE_MAX; ++type)
272 INIT_HLIST_HEAD(&pid->tasks[type]);
273
274 spin_lock_irq(&pidmap_lock);
275 for (i = ns->level; i >= 0; i--) {
276 upid = &pid->numbers[i];
277 hlist_add_head_rcu(&upid->pid_chain,
278 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
279 }
280 spin_unlock_irq(&pidmap_lock);
281
282 out:
283 return pid;
284
285 out_free:
286 while (++i <= ns->level)
287 free_pidmap(pid->numbers + i);
288
289 kmem_cache_free(ns->pid_cachep, pid);
290 pid = NULL;
291 goto out;
292 }
1.分配pid,从pidmap中查找空闲id,并赋给pid
2.将pid链入pid哈希表中,方便根据id找到进程描述符
pid分配如下图所示:
xvii.attach_pid
/* kernel/pid.c */
315 /*
316 * attach_pid() must be called with the tasklist_lock write-held.
317 */
318 void attach_pid(struct task_struct *task, enum pid_type type,
319 struct pid *pid)
320 {
321 struct pid_link *link;
322
323 link = &task->pids[type];
324 link->pid = pid;
325 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
326 }
1.取进程描述符中类型为type的pid_link
2.将link->pid置成pid
3.将link链入相应的pid中
下图表示进程添加到相应的进程、进程组、会话组后的关系:
更多推荐
所有评论(0)