PG在还没有pg_class的时候怎么访问基础系统表？

在没有pg_class的时候，数据库怎么访问系统表？这个问题可以分成两个阶段来看：

数据库簇初始化，此时一个database都没有，所以怎么构造和访问pg_class等系统表是一个问题
私有内存初始化系统表。PG的系统表信息是放在backend本地进程上的，backend在初始化的时候又怎么load pg_class？

初始化数据字典
#

在数据库还没有初始化的时候，明显是不能通过访问数据字典来初始化database、pg_class等等对象的，因为一个库都没有就不能create database，也没有pg_class去查元数据信息。 PG通过bki文件的特殊语言初始化一些数据结构，然后在bootstrap模式初始化一个原始database¹。

编译阶段：genbki.h & genbki.pl
#

src/include/catalog/genbki.h：

 * genbki.h defines CATALOG(), BKI_BOOTSTRAP and related macros
 * so that the catalog header files can be read by the C compiler.
 * (These same words are recognized by genbki.pl to build the BKI
 * bootstrap file from these header files.)

genbki.h内容很少，主要是为了catalog相关操作的宏定义，以及给KBI bootstrap文件的宏定义。数据字典的头文件基本都包含genbki.h genbki.pl会在编译过程读取/src/include/catalog目录下的.h表定义文件（不含pg_*_d.h），并创建postgres.bki文件和pg_*_d.h头文件。以pg_class为例：

[postgres@catalog]$ ll |grep pg_class 
-rw-r----- 1 postgres postgres   3682 Aug  6  2019 pg_class.dat
lrwxrwxrwx 1 postgres postgres     86 Apr  8 20:31 pg_class_d.h -> /lzl/soft/postgresql-11.5/src/backend/catalog/pg_class_d.h
-rw-r----- 1 postgres postgres   5219 Aug  6  2019 pg_class.h

pg_*_d.h头文件就是genbki.pl生成的。pg_*_d.h文件中都包含下面的一段话：

It has been GENERATED by src/backend/catalog/genbki.pl

每个数据字典都有一个结构体typedef struct FormData_*catalogname*用以存储数据字典的行数据²，例如pg_class的FormData_pg_class：

CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,RelationRelation_Rowtype_Id) BKI_SCHEMA_MACRO
{
	/* oid */
	Oid			oid;

	/* class name */
	NameData	relname;

	/* OID of namespace containing this class */
	Oid			relnamespace BKI_DEFAULT(pg_catalog) BKI_LOOKUP(pg_namespace);

	/* OID of entry in pg_type for relation's implicit row type, if any */
	Oid			reltype BKI_LOOKUP_OPT(pg_type);

	/* OID of entry in pg_type for underlying composite type, if any */
	Oid			reloftype BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_type);

	/* class owner */
	Oid			relowner BKI_DEFAULT(POSTGRES) BKI_LOOKUP(pg_authid);

...

	/* access-method-specific options */
	text		reloptions[1] BKI_DEFAULT(_null_);

	/* partition bound node tree */
	pg_node_tree relpartbound BKI_DEFAULT(_null_);
#endif
} FormData_pg_class;

pg_class的OID写死了1259，所有字段都在FormData_pg_class结构体中。用户存储数据的结构体初始化后，会使用对应的.dat文件插入基础数据。pg_class中会插入4条数据，可以理解为bootstrap item（pg15中的数据字典表有49个）：

{ oid => '1247',
  relname => 'pg_type', reltype => 'pg_type' },
{ oid => '1249',
  relname => 'pg_attribute', reltype => 'pg_attribute' },
{ oid => '1255',
  relname => 'pg_proc', reltype => 'pg_proc' },
{ oid => '1259',
  relname => 'pg_class', reltype => 'pg_class' },

postgres=#  select oid,relname from  pg_class where oid::int >=1247 and oid::int<=1259;
 oid  |   relname    
------+--------------
 1247 | pg_type
 1249 | pg_attribute
 1255 | pg_proc
 1259 | pg_class

把基础数据字典写入后，其他的都可以依赖这些数据生成。

初始化database阶段：initdb&postgres.bki
#

initdb.c中的注释：

 * To create template1, we run the postgres (backend) program in bootstrap
 * mode and feed it data from the postgres.bki library file.  After this
 * initial bootstrap phase, some additional stuff is created by normal
 * SQL commands fed to a standalone backend. 

以bootstrap模式启动backend并运行postgres.bki脚本，postgres.bki可以在没有任何系统表的情况下，执行相关函数。此后才可以使用正常的SQL文件和启动标准的backend进程。 template1可以称之为bootstrap database了，postgres、template0两个库是在template1建立以后才创建：

void
initialize_data_directory(void)
{
...
	/* Bootstrap template1 */
	bootstrap_template1();
...
	make_template0(cmdfd);

	make_postgres(cmdfd);

	PG_CMD_CLOSE;

	check_ok();
}

有了template1后，make_template0和make_postgres创建对应的template0 database和postgres database，直接用一般的SQL语句CREATE DATABASE命令创建：

/*
 * copy template1 to postgres
 */
static void
make_postgres(FILE *cmdfd)
{
	const char *const *line;

	/*
	 * Just as we did for template0, and for the same reasons, assign a fixed
	 * OID to postgres and select the file_copy strategy.
	 */
	static const char *const postgres_setup[] = {
		"CREATE DATABASE postgres OID = " CppAsString2(PostgresDbOid)
		" STRATEGY = file_copy;\n\n",
		"COMMENT ON DATABASE postgres IS 'default administrative connection database';\n\n",
		NULL
	};

	for (line = postgres_setup; *line; line++)
		PG_CMD_PUTS(*line);
}

backend本地缓存数据字典
#

PG私有内存的基础知识可参考PostgreSQL内存浅析³。

PG的数据字典信息存放在本地backend进程中，非共享。数据字典缓存主要关注的是syscache/catcache和relcache，他们分别缓存系统表和表模式信息。其中syscache/catcache是用于缓存系统表的，syscache相当于catcache的上层结构。syscache是一个数组，数字中的每个元素对应一个catcache，每个catcache对应一个系统表¹。

//PG15.3 SysCacheSize=35
static CatCache *SysCache[SysCacheSize];

pg在fork backend的时候调用的是InitPostgres，其中会调用syscache/catcache和relcache的初始化函数。下面来看看backend的初始化。

syscache/catcache初始化
#

struct cachedesc
{
	Oid			reloid;			/* OID of the relation being cached */
	Oid			indoid;			/* OID of index relation for this cache */
	int			nkeys;			/* # of keys needed for cache lookup */
	int			key[4];			/* attribute numbers of key attrs */
	int			nbuckets;		/* number of hash buckets for this cache */
};

static const struct cachedesc cacheinfo[] = {
	{
...	
	{RelationRelationId,		/* RELNAMENSP */
		ClassNameNspIndexId,
		2,
		{
			Anum_pg_class_relname,
			Anum_pg_class_relnamespace,
			0,
			0
		},
		128
	},
	{RelationRelationId,		/* RELOID */
		ClassOidIndexId,
		1,
		{
			Anum_pg_class_oid,
			0,
			0,
			0
		},
		128
...
};

例如pg_class，由genbki.pl生成的pg_class_d.h中定义Anum_pg_class_oid

#define Anum_pg_class_oid 1

reloid就是oid

  select oid,relname from  pg_class where oid::int >=1247 and oid::int<=1259;
 oid  |   relname    
------+--------------
 1259 | pg_class

InitCatalogCache其实是初始化syscache数组，也就是初始化所有的catcache。InitCatalogCache最终通过InitCatCache全量初始化CatCache（这里其中一个就有pg_class的)：

void
InitCatalogCache(void)
{
...
	for (cacheId = 0; cacheId < SysCacheSize; cacheId++)
	{
		SysCache[cacheId] = InitCatCache(cacheId,
										 cacheinfo[cacheId].reloid,
										 cacheinfo[cacheId].indoid,
										 cacheinfo[cacheId].nkeys,
										 cacheinfo[cacheId].key,
										 cacheinfo[cacheId].nbuckets);
		if (!PointerIsValid(SysCache[cacheId]))
			elog(ERROR, "could not initialize cache %u (%d)",
				 cacheinfo[cacheId].reloid, cacheId);
		/* Accumulate data for OID lists, too */
		SysCacheRelationOid[SysCacheRelationOidSize++] =
			cacheinfo[cacheId].reloid;
		SysCacheSupportingRelOid[SysCacheSupportingRelOidSize++] =
			cacheinfo[cacheId].reloid;
		SysCacheSupportingRelOid[SysCacheSupportingRelOidSize++] =
			cacheinfo[cacheId].indoid;
		/* see comments for RelationInvalidatesSnapshotsOnly */
		Assert(!RelationInvalidatesSnapshotsOnly(cacheinfo[cacheId].reloid));
	}
...
	CacheInitialized = true;
}

然后来到catcache.c。 InitCatCache会开辟内存，并且放到CacheMemoryContext中管理。它也只是把宏定义的一些oid赋值给对应的catcache，此时还没有open表：

/*
 *		InitCatCache
 *
 *	This allocates and initializes a cache for a system catalog relation.
 *	Actually, the cache is only partially initialized to avoid opening the
 *	relation.  The relation will be opened and the rest of the cache
 *	structure initialized on the first access.
 */
CatCache *
InitCatCache(int id,
			 Oid reloid,
			 Oid indexoid,
			 int nkeys,
			 const int *key,
			 int nbuckets)
{
...

	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
...
	sz = sizeof(CatCache) + PG_CACHE_LINE_SIZE;
	cp = (CatCache *) CACHELINEALIGN(palloc0(sz));
	cp->cc_bucket = palloc0(nbuckets * sizeof(dlist_head));

	/*
	 * initialize the cache's relation information for the relation
	 * corresponding to this cache, and initialize some of the new cache's
	 * other internal fields.  But don't open the relation yet.
	 */
	cp->id = id;
	cp->cc_relname = "(not known yet)";
	cp->cc_reloid = reloid;
	cp->cc_indexoid = indexoid;
	cp->cc_relisshared = false; /* temporary */
	cp->cc_tupdesc = (TupleDesc) NULL;
	cp->cc_ntup = 0;
	cp->cc_nbuckets = nbuckets;
	cp->cc_nkeys = nkeys;
	for (i = 0; i < nkeys; ++i)
		cp->cc_keyno[i] = key[i];
...
	MemoryContextSwitchTo(oldcxt);

	return cp;
}

id是catcache数组元素的编号，赋值的reloid是已知的cacheinfo中的oid，也赋值了cacheinfo中的key[4]，其他信息基本都还不知道，例如relname、tupdesc，因为到这里系统表还没有open。 catcache只有在search的时候才有open的操作，虽然函数名字类似*init*，不过已经不在初始化的过程中了，相关函数不再这里展示。 syscache/catcache初始化完成后，实际上是没有任何元组信息的。

relcache初始化
#

relcache初始化这篇PostgreSQL内存浅析已经讲的比较好了。 relcache初始化由5个阶段:

RelationCacheInitialize - 初始化relcache，初始化为空的
RelationCacheInitializePhase2 - 初始化共享的catalog，并加载5个global系统表
RelationCacheInitializePhase3 - 完成初始化relcache，并加载4个基础系统表
RelationIdGetRelation - 通过relation id获得rel描述
RelationClose - 关闭一个relation

其中RelationCacheInitializePhase2 RelationCacheInitializePhase3 都有load系统表，他们有先后顺序的必要。 RelationCacheInitializePhase2有兴趣的可以自行查看函数，也load几个系统表；RelationCacheInitializePhase3 是与我们的问题相关的，我们看这个：

/*
 *		RelationCacheInitializePhase3
 *
 *		This is called as soon as the catcache and transaction system
 *		are functional and we have determined MyDatabaseId.  At this point
 *		we can actually read data from the database's system catalogs.
 *		We first try to read pre-computed relcache entries from the local
 *		relcache init file.  If that's missing or broken, make phony entries
 *		for the minimum set of nailed-in-cache relations.  Then (unless
 *		bootstrapping) make sure we have entries for the critical system
 *		indexes.  Once we've done all this, we have enough infrastructure to
 *		open any system catalog or use any catcache.  The last step is to
 *		rewrite the cache files if needed.
 */
void
RelationCacheInitializePhase3(void)
{
...
	if (IsBootstrapProcessingMode() ||
		!load_relcache_init_file(false))
	{
		needNewCacheFile = true;

		formrdesc("pg_class", RelationRelation_Rowtype_Id, false,
				  Natts_pg_class, Desc_pg_class);
		formrdesc("pg_attribute", AttributeRelation_Rowtype_Id, false,
				  Natts_pg_attribute, Desc_pg_attribute);
		formrdesc("pg_proc", ProcedureRelation_Rowtype_Id, false,
				  Natts_pg_proc, Desc_pg_proc);
		formrdesc("pg_type", TypeRelation_Rowtype_Id, false,
				  Natts_pg_type, Desc_pg_type);

#define NUM_CRITICAL_LOCAL_RELS 4	/* fix if you change list above */
	}

	MemoryContextSwitchTo(oldcxt);

	/* In bootstrap mode, the faked-up formrdesc info is all we'll have */
	if (IsBootstrapProcessingMode())
		return;

...
		/* now write the files */
		write_relcache_init_file(true);
		write_relcache_init_file(false);
	}
}

IsBootstrapProcessingMode其实是专门为bootstrap模式定制的判断，一般的backend是不满足这个条件的。 load_relcache_init_file(false)尝试从initfile中加载系统表信息，load_relcache_init_file(false)传入的是false表示是私有initfile，不是共享initfile：

[postgres@16384]$ pwd
/pgdata/lzl/data15_6879/base/16384
--粗糙一点看。strings会忽略一部分信息，但是表和列名可以看到
[postgres@16384]$ strings pg_internal.init |grep pg_class
pg_class_oid_index
pg_class
pg_class_relname_nsp_index
[postgres@16384]$ strings pg_internal.init |grep -E "pg_class|relname"
pg_class_oid_index
pg_class
relname
relnamespace
pg_class_relname_nsp_index
relname
relnamespace

如果initfile损坏或者没有，那么加载initfile失败进入判断，去load 4个基础系统表：

	//跟2阶段差不多，加载更多的系统表描述
	if (IsBootstrapProcessingMode() ||
		!load_relcache_init_file(false))
	{
		needNewCacheFile = true;

		formrdesc("pg_class", RelationRelation_Rowtype_Id, false,
				  Natts_pg_class, Desc_pg_class);
		formrdesc("pg_attribute", AttributeRelation_Rowtype_Id, false,
				  Natts_pg_attribute, Desc_pg_attribute);
		formrdesc("pg_proc", ProcedureRelation_Rowtype_Id, false,
				  Natts_pg_proc, Desc_pg_proc);
		formrdesc("pg_type", TypeRelation_Rowtype_Id, false,
				  Natts_pg_type, Desc_pg_type);

有了pg_class 4个基础表，后面加载系统表信息一切都很简单了

References
#

《PostgreSQL内核分析》第2，3章 ↩︎ ↩︎
https://www.postgresql.org/docs/current/system-catalog-declarations.html ↩︎
PostgreSQL内存浅析 ↩︎

初始化数据字典#

编译阶段：genbki.h & genbki.pl#

初始化database阶段：initdb&postgres.bki#

backend本地缓存数据字典#

syscache/catcache初始化#

relcache初始化#

References#

相关文章