Understanding the Linux kernel 笔记

#用户态&内核态

#解析

经常会提到 用户态, 内核态, 用户空间, 内核空间 这几个概念,但是对其理解并不深刻,只有一个感性的认识,知其然,不知其所以然。

下面这个程序是进行归并排序,数据量为5000000

当前排序操作不涉及外部存储,不存在系统调用,通过top 命令可以看到,cpu 99.7 us。

其中,第一项99.7 us(user 的缩写)就是 CPU 消耗在 User space 的时间百分比。第二项0.3 sy(system 的缩写)是消耗在 Kernel space 的时间百分比。

  • ni:niceness 的缩写,CPU 消耗在 nice 进程(低优先级)的时间百分比
  • id:idle 的缩写,CPU 消耗在闲置进程的时间百分比,这个值越低,表示 CPU 越忙
  • wa:wait 的缩写,CPU 等待外部 I/O 的时间百分比,这段时间 CPU 不能干其他事,但是也没有执行运算,这个值太高就说明外部设备有问题
  • hi:hardware interrupt 的缩写,CPU 响应硬件中断请求的时间百分比
  • si:software interrupt 的缩写,CPU 响应软件中断请求的时间百分比
  • st:steal time 的缩写,该项指标只对虚拟机有效,表示分配给当前虚拟机的 CPU 时间之中,被同一台物理机上的其他虚拟机偷走的时间百分比

以下命令生成10G文件,可以看到 wa 的值为97.9
dd if=/dev/zero of=/root/randomfile bs=1M count=10240

下面的程序既有计算操作,也有写文件操作,执行时会在用户空间和内核空间之间反复切换。优化时可以将这类操作分块(批量)进行,减少用户态与内核态之间的频繁切换,从而提高性能。
具体优化方法,我还不知道~

# Each iteration mixes plain arithmetic (user space) with file I/O (served by
# the kernel), so the process keeps crossing between user and kernel space.
for i in range(1000000):
    i+1  # user space
    fd = open("test.file","a+")
    fd.write(str(i))  # kernel space
    fd.write("\n")  # kernel space
    fd.close()
    i*i  # user space
# time python test.py
python test.py 2.24s user 4.74s system 98% cpu 7.067 total

#References

http://www.ruanyifeng.com/blog/2016/12/user_space_vs_kernel_space.html

#进程&线程

#进程与线程的主要区别

  1. 线程共享创建它的进程的地址空间,而进程有独立的地址空间
  2. 线程对所在进程中的数据段可以直接访问,子进程具有父进程的数据段副本
  3. 线程能与其它同一进程内的线程进行通信,进程间使用进程间通信手段进行通信
    • 进程间通信:
      • 管道和FIFO
      • 信号量
      • 消息
      • 共享内存区 最高效的进程通信方式,zero copy?
      • 套接字 socket
  4. 线程几乎没有开销,进程有相当大的开销
  5. 新线程很容易创建,新进程需要父进程进行复制操作
  6. 线程可以对同一进程下的其他线程进行控制,进程只能控制其子进程
  7. 对主线程的更改(取消、优先级更改等)可能会影响进程中的其他线程;对父进程的更改不会影响子进程。

#参考链接

https://stackoverflow.com/questions/200469/what-is-the-difference-between-a-process-and-a-thread

#第三章 进程

#进程描述符

进程描述符是task_struct类型结构体。该结构体的定义的文件位置是
/Users/matianqi/Project/Linux_Kernel/linux-2.6.11/include/linux/sched.h
具体定义:

struct task_struct { /* the process descriptor type */
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info; /* basic information about the process */
atomic_t usage;
unsigned long flags; /* per process flags, defined below */
unsigned long ptrace;

int lock_depth; /* Lock depth */

int prio, static_prio;
struct list_head run_list;
prio_array_t *array;

unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
int activated;

unsigned long policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;

#ifdef CONFIG_SCHEDSTATS
struct sched_info sched_info;
#endif

struct list_head tasks;
/*
* ptrace_list/ptrace_children forms the list of my children
* that were stolen by a ptracer.
*/
struct list_head ptrace_children;
struct list_head ptrace_list;

struct mm_struct *mm, *active_mm;

/* task state */
struct linux_binfmt *binfmt;
long exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
/* ??? */
unsigned long personality;
unsigned did_exec:1;
pid_t pid;
pid_t tgid;
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->parent->pid)
*/
struct task_struct *real_parent; /* real parent process (when being debugged) */
struct task_struct *parent; /* parent process */
/*
* children/sibling forms the list of my children plus the
* tasks I'm ptracing.
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

/* PID/PID hash table linkage. */
struct pid pids[PIDTYPE_MAX];

struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

unsigned long rt_priority;
unsigned long it_real_value, it_real_incr;
cputime_t it_virt_value, it_virt_incr;
cputime_t it_prof_value, it_prof_incr;
struct timer_list real_timer;
cputime_t utime, stime;
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;
/* process credentials */
uid_t uid,euid,suid,fsuid;
gid_t gid,egid,sgid,fsgid;
struct group_info *group_info;
kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
unsigned keep_capabilities:1;
struct user_struct *user;
#ifdef CONFIG_KEYS
struct key *session_keyring; /* keyring inherited over fork */
struct key *process_keyring; /* keyring private to this process (CLONE_THREAD) */
struct key *thread_keyring; /* keyring private to this thread */
#endif
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN];
/* file system info */
int link_count, total_link_count;
/* ipc stuff */
struct sysv_sem sysvsem;
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespace */
struct namespace *namespace;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;

sigset_t blocked, real_blocked;
struct sigpending pending;

unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;

void *security;
struct audit_context *audit_context;

/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
spinlock_t alloc_lock;
/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
spinlock_t proc_lock;
/* context-switch lock */
spinlock_t switch_lock;

/* journalling filesystem info */
void *journal_info;

/* VM state */
struct reclaim_state *reclaim_state;

struct dentry *proc_dentry;
struct backing_dev_info *backing_dev_info;

struct io_context *io_context;

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
/*
* current io wait handle: wait queue entry to use for io waits
* If this thread is processing aio, this points at the waitqueue
* inside the currently handled kiocb. It may be NULL (i.e. default
* to a stack based synchronous wait) if its doing sync IO.
*/
wait_queue_t *io_wait;
/* i/o counters (bytes read/written, #syscalls) */
u64 rchar, wchar, syscr, syscw;
#if defined(CONFIG_BSD_PROCESS_ACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
clock_t acct_stimexpd; /* clock_t-converted stime since last update */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
#endif
};

可以看到task_struct的定义有161行。

#进程状态

在该文件中有进程状态的宏定义:

/*
 * Process state values. TASK_RUNNING is 0; every other value is a
 * distinct power of two.
 */
#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE 1
#define TASK_UNINTERRUPTIBLE 2
#define TASK_STOPPED 4
#define TASK_TRACED 8
#define EXIT_ZOMBIE 16
#define EXIT_DEAD 32

可以将以上的宏赋值给进程描述符的state字段,从而标示进程状态。

p->state = TASK_RUNNING; /* set the state field of process descriptor p to TASK_RUNNING (0, runnable) */

#疑问

  1. volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 中表示 state 的值大于0 时为stopped状态。这个stopped和 TASK_STOPPED有什么区别?