找到 lua 死循环代码

最近遇到 lua 死循环的情况，就研究下如何找到死循环所在的代码，方便定位和解决问题。

首先，要找到出问题的协程（lua_State）。lua vm 实现时，每个协程都有自己独立的堆栈（stack），以及函数栈（CallInfo）。

所以，要先找到死循环的协程，再从这个协程函数栈找到代码。

怎么找到这个协程？

有三种办法：
方法1、在死循环会执行到的 opcode ，加代码取得当前协程；
方法2、遍历所有 gc 数据对象，找到当前正在执行的协程；
方法3、重写 coroutine.resume 及 wrap 函数，取得当前协程。

方法2 找到 lua 协程

先说第 2 种方法，因为我一开始就是这么干的，能快速找到，不用修改 lua vm 代码，但弊端也有，需要处理多线程问题。

以下代码以 Lua 5.3.4 为例，其他版本的内部结构可能不同，需要相应修改。

global_State *g = G(L);
GCObject *p;
// 遍历所有的数据对象
for (p = g->allgc; p != NULL; p = p->next) {
	// 找到所有 lua state 对象， LUA_TTHREAD 定义 8
	if (p->tt == 8 ) {
		// 存活对象， 由 !isdead(g,v) 展开
		if ((p->marked ^ 3) & (g->currentwhite ^ 3)) {
			// 取得 lua state， 由 gco2th(p) 展开
			lua_State *thread = &((union GCUnion *)p)->th;
			// lua state 正在运行， LUA_OK 定义 0
			if (thread->status == 0 && thread->ci != &thread->base_ci) {
				// 找到当前正在运行的协程
			}
		}
	}
}

以上代码，在多线程下，你的线程正在遍历 gc 数据对象，而调度 lua vm 的线程执行 gc ，就会出问题，数据对象可能被释放掉了。

如果确定要用这个方法，可以利用 lua 预留的锁函数

// llimits.h

/*
** macros that are executed whenever program enters the Lua core
** ('lua_lock') and leaves the core ('lua_unlock')
** Both default to a no-op here; they are only defined when 'lua_lock'
** has not already been defined, so a build can supply its own versions.
*/
#if !defined(lua_lock)
#define lua_lock(L)	((void) 0)
#define lua_unlock(L)	((void) 0)
#endif

如下使用互斥锁，具体实现就不赘述了。

/*
** Serialize entry to the Lua core with a mutex kept in the global state.
** Assumes a 'pthread_mutex_t lock' member has been added to global_State
** and is initialized before first use -- that part is not shown here.
** There is deliberately no trailing semicolon: the call site supplies its
** own, so the macros behave like ordinary statements (for example inside
** an unbraced if/else).
*/
#define lua_lock(L)	((void) pthread_mutex_lock(&G(L)->lock))
#define lua_unlock(L)	((void) pthread_mutex_unlock(&G(L)->lock))

方法1 找到 lua 协程

在 skynet 的代码中， lua vm 处理 opcode 时，对于一些特定 opcode，加代码标记当前 lua_State，涉及的 opcode 有 OP_JMP、 OP_FORLOOP、OP_TFORLOOP、OP_TAILCALL、OP_CALL

// lvm.c

/* Jump opcode: the skynet signal check runs before the jump is taken. */
vmcase(OP_JMP) {
  lua_checksig(L);  // skynet add
  dojump(ci, i, 0);
  vmbreak;
}

// skynet add
/* Holds the main thread of the Lua state that should be interrupted, or NULL. */
lua_State * skynet_sig_L = NULL;

/*
** Reached through the lua_checksig macro. When the pending signal names
** the main thread of the state 'L' belongs to, the signal is cleared and
** an error with a nil error value is raised in 'L', which is how the
** endless loop gets broken.
*/
LUA_API void
lua_checksig_(lua_State *L) {
  if (skynet_sig_L != G(L)->mainthread)
    return;  /* nothing pending, or the signal is for another Lua state */
  skynet_sig_L = NULL;
  lua_pushnil(L);
  lua_error(L);
}

// lua.h

// skynet add
/* Main thread of the Lua state to interrupt; tested against NULL by lua_checksig. */
LUA_API lua_State * skynet_sig_L;
LUA_API void (lua_checksig_)(lua_State *L);
/*
** Inline test first; lua_checksig_ is only called while a signal is set.
** Wrapped in do/while(0) so the macro acts as one statement and cannot
** capture an 'else' that follows it at the call site.
*/
#define lua_checksig(L) do { if (skynet_sig_L) { lua_checksig_(L); } } while (0)

所以，出现死循环时，不需要知道哪个协程。只要标记 lua vm 的主协程（mainthread），主协程在 lua vm 初始化后不会改变。标记在 skynet_sig_L

这样，lua vm 在处理上述 opcode 时，如果发现当前 vm 的主协程就是被标记的协程，那么当前正在执行的协程很可能就是陷入死循环的协程。

方法3 找到 lua 协程

在 skynet 新的代码中，通过重写 global 表的 coroutine.resume 、 coroutine.wrap 函数，取得当前协程

// service_snlua.c

/*
** Excerpt of the snlua service initialization (elided parts are marked
** with "//..."). Shown here: open the standard libraries, load the
** "skynet.profile" module, then overwrite the 'resume' and 'wrap' fields
** of the global 'coroutine' table with the module's versions.
*/
static int
init_cb(struct snlua *l, skynet_context *ctx, const char * args, size_t sz) {
	lua_State *L = l->L;
	//...
	luaL_openlibs(L);
	luaL_requiref(L, "skynet.profile", init_profile, 0);

	/* stack index of the value left on top by luaL_requiref (the module) */
	int profile_lib = lua_gettop(L);
	// replace coroutine.resume / coroutine.wrap
	lua_getglobal(L, "coroutine");
	lua_getfield(L, profile_lib, "resume");
	lua_setfield(L, -2, "resume");
	lua_getfield(L, profile_lib, "wrap");
	lua_setfield(L, -2, "wrap");

	//...
}

/*
** Opener for the "skynet.profile" module (excerpt). The registration
** table maps "resume" and "wrap" to luaB_coresume and luaB_cowrap, which
** are the functions init_cb copies into the 'coroutine' table.
*/
static int
init_profile(lua_State *L) {
	luaL_Reg l[] = {
		{ "start", lstart },
		{ "stop", lstop },
		{ "resume", luaB_coresume },
		{ "wrap", luaB_cowrap },
		{ NULL, NULL },
	};
	luaL_newlibtable(L,l);
	// ...
}


/*
** Replacement for coroutine.resume (excerpt): the coroutine is taken from
** stack index 1 and the remaining lua_gettop(L) - 1 values are passed on
** as the argument count to timing_resume.
*/
static int luaB_coresume (lua_State *L) {
	int r = timing_resume(L, 1, lua_gettop(L) - 1);
	// ...
}

/*
** Excerpt: reads the coroutine at stack index 'co_index', resumes it with
** 'n' arguments through auxresume and returns auxresume's result.
*/
static int
timing_resume(lua_State *L, int co_index, int n) {
	lua_State *co = lua_tothread(L, co_index);
	// ...
	int r = auxresume(L, co, n);
	// ...
	return r;
}

/*
** Excerpt: moves 'narg' values from L's stack to 'co', then resumes 'co'
** (with L as the resuming thread) through lua_resumeX.
*/
static int auxresume (lua_State *L, lua_State *co, int narg) {
	int status, nres;
  	lua_xmove(L, co, narg);
	status = lua_resumeX(co, L, narg, &nres);
	// ...
}


/*
** Wrapper around lua_resume that tracks which coroutine is active
** (excerpt). The allocator userdata of the state is cast to the owning
** 'struct snlua'. The active coroutine is switched to 'L' before the
** resume and back to 'from' once lua_resume has returned.
*/
static int
lua_resumeX(lua_State *L, lua_State *from, int nargs, int *nresults) {
	void *ud = NULL;
	lua_getallocf(L, &ud);
	struct snlua *l = (struct snlua *)ud;
	switchL(L, l);
	int err = lua_resume(L, from, nargs, nresults);
	// ...
	switchL(from, l);
	return err;
}

/* Excerpt: stores 'L' in l->activeL, i.e. marks it as the coroutine now running. */
static void
switchL(lua_State *L, struct snlua *l) {
	l->activeL = L;
	// ...
}

以上，l->activeL 为当前执行的协程。

找到当前正在运行的代码

遍历当前协程的函数栈，一直回溯，打印每个 Lua 函数所在的源文件名及其定义的起止行号。


CallInfo *ci;
/* Walk the call stack from the innermost frame back to, but excluding, base_ci. */
for (ci = thread->ci; ci != NULL && ci != &thread->base_ci; ci = ci->previous) {
	/*
	** Lua closures only; this is ttype(ci->func) == LUA_TLCL written out.
	** The parentheses are required: '==' binds tighter than '&', so
	** without them the mask is applied to (0x3F == 6), i.e. 0, and the
	** test is always false.
	*/
	if ((ci->func->tt_ & 0x3F) == 6) {
		/* Function prototype; this is clLvalue(ci->func)->p written out. */
		Proto *sp = ((union GCUnion *)(ci->func->value_.gc))->cl.l.p;
		/* Source (chunk) name; this is getstr(sp->source) written out. */
		const char *filename = sp->source
			? (const char *)(sp->source) + sizeof(UTString)
			: "unknown";
		/* One line per frame: source name and the line range of the function definition. */
		printf("LUA FUNCTION : %s %d %d\n", filename, sp->linedefined,
			sp->lastlinedefined);
	}
}

死循环检查

如何发现 lua 代码出现了死循环，说下 skynet 的思路。

// skynet_server.c

// Before each message is handled, record its source and the handle of the service about to run in the monitor
skynet_monitor_trigger(sm, msg.source , handle);

if (ctx->cb == NULL) {
	skynet_free(msg.data);
} else {
	dispatch_message(ctx, &msg);
}
// Message handled: clear the record in the monitor (source and destination set to 0)
skynet_monitor_trigger(sm, 0,0);

看下 monitor 模块的处理。

// skynet_monitor.c

/*
** Stores the given source and destination in the monitor and increments
** its version counter. The dispatch code calls it once before handling a
** message and once more with (0, 0) afterwards.
** NOTE(review): the increment comes after the two stores, presumably so
** a checker that sees the new version also sees the new fields -- confirm.
*/
void
skynet_monitor_trigger(struct skynet_monitor *sm, uint32_t source, uint32_t destination) {
	sm->source = source;
	sm->destination = destination;
	// bump the version number
	ATOM_INC(&sm->version); 
}

然后，再起个线程定时检查版本号，如果版本号不变，则判定 lua 代码可能死循环

// skynet_monitor.c

/*
** Periodic check. If the version moved since the previous check, remember
** the new value and return. If it did not move and a destination is
** recorded, the service is flagged via skynet_context_endless and a
** warning is logged, since the message may be stuck in an endless loop.
*/
void
skynet_monitor_check(struct skynet_monitor *sm) {
	if (sm->version != sm->check_version) {
		/* progress was made: just resynchronize the checkpoint */
		sm->check_version = sm->version;
		return;
	}
	if (!sm->destination) {
		/* no message is being handled, nothing to report */
		return;
	}
	skynet_context_endless(sm->destination);
	skynet_error(NULL, "A message from [ :%08x ] to [ :%08x ] maybe in an endless loop (version = %d)",
		sm->source , sm->destination, sm->version);
}

其他项目也可以参照 skynet 的思路：每次调用 Lua 代码前后各自增一次版本号，再由另一个线程定时检查版本号是否发生变化；如果版本号长时间没有变化，就说明这次调用可能陷入了死循环。

结束语

方法2 要遍历所有的数据对象，在 vm 数据对象过多时，开销比较大，但这种方法也有优点，不改变 lua 原生代码的处理，没有调用时对 vm 只有互斥锁的开销，没有临界资源竞争，锁的开销就非常小。再者，大多数项目中，死循环出现的可能性也比较低。

最后，欢迎评论！