Contents

file

图解 Linux 文件系统

Nonblocking I/O

从内核文件系统看文件读写过程

files

图解|什么是缺页错误Page Fault

Processes and Tasks

INTRODUCTION TO THE LINUX VIRTUAL FILESYSTEM (VFS) – PART I: A HIGH-LEVEL TOUR

Linux in Depth - 文件系统及 Socket 源码解析

Linux内核Page Cache和Buffer Cache关系及演化历史

深度理解 Linux 读取文件过程!

原来8张图,就可以搞懂「零拷贝」了

Linux I/O 原理和 Zero-copy 技术全面揭秘

操作系统 I/O 全流程详解

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210808/Y3iPSE.jpg https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210709/Xohv5b.jpg https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210808/GuXZrh.jpg https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210806/S4d1Zl.jpg https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210812/EB72WD.jpg

data structure

1
2
openfile table:
file-> node
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// process
/*
 * Per-process descriptor (heavily abridged excerpt).
 * Only the two members that lead to filesystem state are shown; the
 * "// ..." markers stand for the members omitted from these notes.
 */
struct task_struct {
	// ...
	/* Filesystem information: */
	struct fs_struct		*fs;

	/* Open file information: */
	struct files_struct		*files;
	// ...
};

/*
 * Per-process open-file bookkeeping (abridged excerpt); this is the type
 * that task_struct's `files` member points to.
 */
struct files_struct {
	// ...
	/* Pointer to the descriptor table (struct fdtable). */
	struct fdtable __rcu *fdt;
	// ...
};

/*
 * File-descriptor table: the `fd` array is indexed by descriptor number and
 * each slot points to the corresponding open struct file.
 */
struct fdtable {
	/* NOTE(review): presumably the number of slots in `fd` — confirm. */
	unsigned int max_fds;
	struct file __rcu **fd;      /* current fd array */
	/* NOTE(review): the three pointers below look like per-descriptor
	 * bitmaps (close-on-exec flags, in-use descriptors, fully-used words
	 * of open_fds) — confirm against the kernel source. */
	unsigned long *close_on_exec;
	unsigned long *open_fds;
	unsigned long *full_fds_bits;
	/* NOTE(review): presumably used to defer freeing the table via RCU — confirm. */
	struct rcu_head rcu;
};

struct file {
    struct path                     f_path;         // a dentry and a mount point which locate this file
    struct inode                    *f_inode;       // the inode underlying this file
    const struct file_operations    *f_op;          // callbacks to function which can operate on this file
    spinlock_t                      f_lock;
    atomic_long_t                   f_count;
    unsigned int                    f_flags;
    fmode_t                         f_mode;
    struct mutex                    f_pos_lock;
    loff_t                          f_pos           // offset in the file from which the next read or write shall commence
    struct fown_struct              f_owner;
    void                            *private_data
    struct address_space            *f_mapping;     // callbacks for memory mapping operations
}; 
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
   /*
    * Directory entry (abridged excerpt): ties a name to an inode and links
    * the entry into its parent's list of children.
    */
   struct dentry {
    struct hlist_bl_node            d_hash;                     // lookup hash list
    struct dentry                   *d_parent;	                // parent directory
    // NOTE(review): presumably the name of this entry - confirm
    struct qstr                     d_name;
    struct inode                    *d_inode;                   // where the name belongs to
    unsigned char                   d_iname[DNAME_INLINE_LEN];  // small names
    // NOTE(review): presumably the table of dentry callbacks - confirm
    const struct dentry_operations  *d_op;
    void                            *d_fsdata;                  // fs-specific data
    struct list_head                d_child;                    // child of parent list, i.e., our siblings
    struct list_head                d_subdirs;                  // our children
};


/*
 * Simplified inode layout used in these notes: file metadata plus the
 * block-number array that locates the file's data.
 * NOTE(review): this is not the full VFS struct inode; it looks like a small
 * on-disk style layout - confirm which filesystem it was taken from.
 */
struct inode {
     __u16 i_mode;     // file mode
     __u16 i_nlinks;   // link count
     __u16 i_uid;      // owner user ID
     __u16 i_gid;      // owner group ID
     __u32 i_size;     // file size
     __u32 i_atime;    // access time
     __u32 i_mtime;    // modification time
     __u32 i_ctime;    // change time (the original note said "creation time"; on Unix, ctime is conventionally the inode status-change time - confirm)
     __u32 i_zone[10]; // block numbers of the file's data blocks
};

https://cdn.jsdelivr.net/gh/toms2077/imgs@master/20230905/5yZgGRJTAh6H.png

soft link: 类似 Windows 快捷方式,新文件的内容是原文件的路径;原文件删除后,新文件不可用。 hard link: 增加了文件 inode 的引用数,新文件名直接指向同一个 inode;原文件删除后,新文件继续可用。

disk vs sockets

  1. 建立映射部分是一样的实现方式

task–> private file table—> file;

  1. 实际的存储部分,也就是inode开始出现差异

是什么

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210816/HkiTBY.jpg

为用户层提供i/o的统一接口;调用底层的实现(ext2,ext4);

  1. char: 连续字节流;键盘/鼠标
  2. block: 被分成很多block;disk/usb;
  3. socket:网络

主要数据结构

  1. task_struct.[files_struct].[fdtable]: openfile table;
  2. fdtable[n]–> file: file;
  3. file–> dentry–> inode;
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// process
/*
 * Per-process descriptor (heavily abridged excerpt).
 * Only the two members that lead to filesystem state are shown; the
 * "// ..." markers stand for the members omitted from these notes.
 */
struct task_struct {
	// ...
	/* Filesystem information: */
	struct fs_struct		*fs;

	/* Open file information: */
	struct files_struct		*files;
	// ...
};

/*
 * Per-process open-file bookkeeping (abridged excerpt); this is the type
 * that task_struct's `files` member points to.
 */
struct files_struct {
	// ...
	/* Pointer to the descriptor table (struct fdtable). */
	struct fdtable __rcu *fdt;
	// ...
};

/*
 * File-descriptor table: the `fd` array is indexed by descriptor number and
 * each slot points to the corresponding open struct file.
 */
struct fdtable {
	/* NOTE(review): presumably the number of slots in `fd` — confirm. */
	unsigned int max_fds;
	struct file __rcu **fd;      /* current fd array */
	/* NOTE(review): the three pointers below look like per-descriptor
	 * bitmaps (close-on-exec flags, in-use descriptors, fully-used words
	 * of open_fds) — confirm against the kernel source. */
	unsigned long *close_on_exec;
	unsigned long *open_fds;
	unsigned long *full_fds_bits;
	/* NOTE(review): presumably used to defer freeing the table via RCU — confirm. */
	struct rcu_head rcu;
};

struct file {
    struct path                     f_path;         // a dentry and a mount point which locate this file
    struct inode                    *f_inode;       // the inode underlying this file
    const struct file_operations    *f_op;          // callbacks to function which can operate on this file
    spinlock_t                      f_lock;
    atomic_long_t                   f_count;
    unsigned int                    f_flags;
    fmode_t                         f_mode;
    struct mutex                    f_pos_lock;
    loff_t                          f_pos           // offset in the file from which the next read or write shall commence
    struct fown_struct              f_owner;
    void                            *private_data
    struct address_space            *f_mapping;     // callbacks for memory mapping operations
}; 
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
   /*
    * Directory entry (abridged excerpt): ties a name to an inode and links
    * the entry into its parent's list of children.
    */
   struct dentry {
    struct hlist_bl_node            d_hash;                     // lookup hash list
    struct dentry                   *d_parent;	                // parent directory
    // NOTE(review): presumably the name of this entry - confirm
    struct qstr                     d_name;
    struct inode                    *d_inode;                   // where the name belongs to
    unsigned char                   d_iname[DNAME_INLINE_LEN];  // small names
    // NOTE(review): presumably the table of dentry callbacks - confirm
    const struct dentry_operations  *d_op;
    void                            *d_fsdata;                  // fs-specific data
    struct list_head                d_child;                    // child of parent list, i.e., our siblings
    struct list_head                d_subdirs;                  // our children
};


/*
 * Simplified inode layout used in these notes: file metadata plus the
 * block-number array that locates the file's data.
 * NOTE(review): this is not the full VFS struct inode; it looks like a small
 * on-disk style layout - confirm which filesystem it was taken from.
 */
struct inode {
     __u16 i_mode;     // file mode
     __u16 i_nlinks;   // link count
     __u16 i_uid;      // owner user ID
     __u16 i_gid;      // owner group ID
     __u32 i_size;     // file size
     __u32 i_atime;    // access time
     __u32 i_mtime;    // modification time
     __u32 i_ctime;    // change time (the original note said "creation time"; on Unix, ctime is conventionally the inode status-change time - confirm)
     __u32 i_zone[10]; // block numbers of the file's data blocks
};

disk vs sockets

  1. 建立映射部分是一样的实现方式

task–> private file table—> file;

  1. 实际的存储部分,也就是inode开始出现差异

controller

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20211118/yxZpdy.jpg

cpu 无法直接跟硬件交流,必须通过控制器: cpu(or other controller: dma…) -> controller(driver) -> device;

controller

  1. buffer data;
  2. execute command;

文件读写过程

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210928/3i70pG.jpg https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210928/W8s35D.jpg

读写都是基于pageCache;

1. read

KernelBuffer From OutSide; then userBuffer From KernelBuffer

  1. process 根据 descriptor table 找到对应 inode;

  2. inode 存储对应 pageCache;

  3. pageCache != null –>直接读取 pageCache

  4. pageCache =null 页缺失异常,创建pageCache,读取disk数据到pageCache;

  5. copy pageCache to userBuffer

write(删改查)

UserBuffer To Kernel Buffer

  1. ..

  2. ..

  3. pageCache != null

  4. pageCache =null 页缺失异常,创建pageCache,读取disk数据到pageCache

  5. 更新pageCache,pageCache被标记为dirty page

———–write 返回;

此时如果断电,则刚写入会丢失!;

  1. 写入disk;
    1. 定期写回 disk
    2. 手动调用 sync() or fsync()写入磁盘

write code

the process: open, write , fsync

code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Demonstrates the durable-write sequence: open -> write -> fsync -> close.
 *
 * Creates (or truncates) "snazzyjazz.txt", writes ten bytes and forces them
 * to stable storage before closing.
 *
 * Returns 0 on success, 1 if any step fails (the failing call is reported on
 * stderr via perror).
 */
int main()
{
  char my_write_str[] = "1234567890";
  char my_filename[] = "snazzyjazz.txt";
  size_t to_write = sizeof my_write_str - 1;   /* exclude the trailing NUL */
  size_t written = 0;
  int status = 1;
  int my_file_descriptor;

  /* Open the file.  Clobber it if it exists.
   * O_CREAT requires the third (mode) argument; without it the permission
   * bits of a newly created file are unspecified. */
  my_file_descriptor = open (my_filename, O_RDWR | O_CREAT | O_TRUNC, 0644);
  if (my_file_descriptor < 0) {
    perror ("open");
    return 1;
  }

  /* write() may transfer fewer bytes than requested, so loop until done. */
  while (written < to_write) {
    ssize_t n = write (my_file_descriptor, my_write_str + written,
                       to_write - written);
    if (n < 0) {
      perror ("write");
      goto out_close;
    }
    written += (size_t) n;
  }

  /* write() returns once the data is in the page cache; fsync() is what
   * makes sure it has reached the disk. */
  if (fsync (my_file_descriptor) < 0) {
    perror ("fsync");
    goto out_close;
  }
  status = 0;

out_close:
  if (close (my_file_descriptor) < 0) {
    perror ("close");
    status = 1;
  }
  return status;
}

how to lookup block

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210709/xXPb6p.jpg

data array index and composited array index

data[1~7] —> block; data[8][1-n]—> n block; data[8][1-n][1-n]—> n^2 block

how to add a new disk to current host

1
2
3
4
5
6
7
8
9
# 1. create a partition on the new disk
 fdisk -l            # list the partitions of all disks to find the new device
 fdisk /dev/vdb      # interactive: create the partition (here /dev/vdb1)

# 2. create a specific filesystem on the partition
 mkfs -t ext4 /dev/vdb1

# 3. mount (attach the disk's filesystem to the Linux directory tree)
 mkdir -p /data      # the mount point must exist before mounting
  mount /dev/vdb1 /data

create a config for permanent mount at boot time

1
2
/etc/fstab
/dev/vdb1 /data ext4 defaults     0   0

command

  1. fdisk: manipulate partition table
    1. fdisk -l: list the partition for all disk
  2. df: report current file system usage

读写单位

sector(扇区):磁盘读写最小单位; block–N个sector组成: 文件读写最小单位(内核层面;内核操作磁盘单位);

pageCache: 文件读写最小单位(用户层面;用户读写文件);

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210927/fxm4Z4.jpg

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210927/WRO02q.jpg

hard disk i/o

https://raw.githubusercontent.com/atony2099/imgs/master/uPic/vQpTiN.jpg

https://raw.githubusercontent.com/atony2099/imgs/master/uPic/GkXCXm.jpg

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210927/1JEAN7.jpg

存储容量 = 磁头数 × 磁道(柱面)数 × 每道扇区数 × 每扇区字节数

1. 过程

  1. 寻道: Tseek是指将读写磁头移动至正确的磁道上所需要的时间。
  2. 旋转延迟: Trotation是指盘片旋转将请求数据所在的扇区移动到读写磁头下方所需要的时间
  3. 传输: Ttransfer是指完成传输所请求的数据所需要的时间

IOPS (input/output operations per second) = 1000ms / (Tseek + Trotation + Ttransfer)

i/o优化;

  1. 顺序写入

顺序写入(追加写);

新增数据的时候直接写在现有数据地址后面;

  1. case: wal, kafka log

  2. 非顺序写入消耗:

    1. 定位地址;
    2. 磁盘消耗: 寻道+旋转
  3. btree 是顺序写入吗? 不是,新增数据需要写入某个特定的 page 上

  4. cons: 读比较麻烦;

DMA VS not DMA

1. not DMA;

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20211118/rwVLiS.jpg

cpu copy data from disk controller to kernel buffer;

  1. controller register-> cpu register,
  2. cpu register to kernel buffer

2. DMA

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210928/GtLvNV.jpg

dma controller copy data from disk controller

zero copy

what: reduce data copy times

for: improve speed of data transport

complete copy:

1
2
buffer = File.read 
Socket.send(buffer)

https://cdn.jsdelivr.net/gh/atony2099/imgs@master/20210928/4U8Cjp.jpg

how

mmap

1
2
buf = mmap(file, len);
write(sockfd, buf, len);

https://raw.githubusercontent.com/atony2099/imgs/master/uPic/5cQ1Hv.jpg

sendfile

1
2
/* On Linux, sendfile() is declared in <sys/sendfile.h>, not <sys/socket.h>. */
#include <sys/sendfile.h>
ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count);

https://raw.githubusercontent.com/atony2099/imgs/master/uPic/50Xekq.jpg

file share

  1. nfs:a distributed file system protocol that allows you to share remote directories over a network.
    1
    
    mount -t nfs 10.10.0.10:/backups /var/backups
    
  2. smb: the Server Message Block protocol (SMB), a network file-sharing protocol
  3. ftp
  4. webdav

文件层次

  1. disk ; 通常代表一个真正的物理磁盘
  2. 分区; 对磁盘进行逻辑分区;划分分区通常是用来存储不同类型文件:系统,个人; 通过挂载集成到linux 目录;
  3. 文件系统: 每个分区使用一种文件系统管理 :如何管理和存储数据; ext4,xfs;

mount: 将分区上的文件系统关联到 某个目录下;