Skip to content

Add Cpp Doc Generate tools #5900

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions ci_scripts/CAPItools/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# CAPI tools
CAPI tools 用于一键生成 C++ 的 rst 文档。

## 调用方式
```python
python main.py <source dir> <target dir>
```

若不设置`source dir`和`target dir`,则默认先查找已安装的`paddlepaddle`包环境。

其中:
- source dir 是安装后的 Paddle C++ API 声明路径。 例如`venv/Lib/site-packages/paddle/include/paddle`。
- target dir 目标文件保存路径。

最终生成结果如下所示:
```python
target dir
| -cn
|- index.rst
|- Paddle
|- fluid
|- phi
|- ...
| -en
|- index.rst
|- Paddle
|- fluid
|- phi
|- ...
```

## 获取最新 PaddlePaddle
python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/windows/cpu-mkl-avx/develop.html

## 代码结构

### `main.py`文件主要用于处理和筛选包文件, 并调用`utils_helper.py`中的函数进行文件生成
```python
def analysis_file() # 用于解析文件内容(多线程不安全)

def generate_docs() # 用于创建目录并传值给 utils_helper.py 中的函数进行文件生成

def cpp2py() # 用于筛选出 cpp api 和 py api 相对应的函数名称
```

### `utils_helper.py`文件主要存放函数生成、解析, 以及文件写入的工作
```python

class func_helper(object) # 用于生成和解析方法
decode() # 用于解析输入输出参数、函数名称、返回值、函数注释信息
class class_helper(object) # 用于生成和解析类
decode() # 同 func_helper()

def generate_overview() # 用于生成 overview.rst 文件
```
150 changes: 150 additions & 0 deletions ci_scripts/CAPItools/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# python main.py [source dir] [target dir]
# python main.py ../paddle .


import CppHeaderParser
import json
import os
import traceback
import sys
import re

from utils_helper import func_helper, class_helper, generate_overview
from utils import get_PADDLE_API_class, get_PADDLE_API_func


# 解析所有的函数, 类, 枚举, 返回一个字典
# 多线程使用并不安全, 请不要使用多线程
def analysis_file(path):
    """Parse a C++ source/header file and return its contents as a dict.

    The file is read as UTF-8 by ``CppHeaderParser.CppHeader``; the parsed
    header is serialised to JSON and loaded back, so the result is made of
    plain JSON-compatible Python objects.

    The original author notes that this is not safe to call from several
    threads at once.

    Args:
        path: Path of the file to parse.

    Returns:
        The decoded JSON representation of the parsed header.
    """
    parsed_header = CppHeaderParser.CppHeader(path, encoding='utf8')
    return json.loads(parsed_header.toJSON())


# 生成文件
# 根据给定的list内容,生成对应的文档信息
def generate_docs(
    all_funcs, all_class, cpp2py_api_list, save_dir, LANGUAGE="cn"
):
    """Write one ``.rst`` file per exported function and class.

    For every entry the output directory is
    ``<save_dir>/<LANGUAGE>/<header path without "../" and ".h">`` and the
    file is ``<name>.rst`` inside it. Directories are created on demand.
    A failure while generating a single file is reported on stdout (with
    the traceback) and does not stop the remaining files.

    Args:
        all_funcs: Parsed function dicts; each needs ``"filename"`` and
            ``"name"`` keys and is handed to ``func_helper``.
        all_class: Parsed class dicts; each needs ``"filename"`` and
            ``"name"`` keys and is handed to ``class_helper``.
        cpp2py_api_list: Names forwarded to ``func_helper``.
        save_dir: Root directory for the generated tree.
        LANGUAGE: Sub-directory name and language flag passed to the
            helpers (``"cn"`` by default).
    """
    for item in all_funcs:
        path = item["filename"].replace("../", "").replace(".h", "")
        dir_path = os.path.join(save_dir, LANGUAGE, path)
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(dir_path, exist_ok=True)

        # A "/" in the name (e.g. "operator/") would be read as a path
        # separator, so it is removed before building the file name.
        func_name = item["name"].replace("/", "")

        # Pure-symbol operators (operator+, operator(), ...) get no rst file
        # because names such as "operator*.rst" are invalid on Windows.
        # They are still passed to the overview by the caller.
        if func_name.startswith('operator'):
            checkwords = func_name.replace('operator', '', 1)
            if re.search(r"\w", checkwords) is None:
                continue

        rst_dir = os.path.join(dir_path, func_name + ".rst")
        try:
            helper = func_helper(item, cpp2py_api_list)
            helper.create_and_write_file(rst_dir, LANGUAGE)
        except Exception:
            # Best effort: report and keep going. ``Exception`` (not a bare
            # except) lets Ctrl-C and SystemExit propagate.
            print(traceback.format_exc())
            print('FAULT GENERATE:' + rst_dir)

    for item in all_class:
        path = item["filename"].replace("../", "").replace(".h", "")
        dir_path = os.path.join(save_dir, LANGUAGE, path)
        os.makedirs(dir_path, exist_ok=True)

        # The parser glues the export macro onto the class name
        # (e.g. "PADDLE_APIDeviceContextPool"); strip it for the file name.
        func_name = item["name"].replace("PADDLE_API", "")
        rst_dir = os.path.join(dir_path, func_name + ".rst")
        try:
            helper = class_helper(item)
            helper.create_and_write_file(rst_dir, LANGUAGE)
        except Exception:
            print(traceback.format_exc())
            print('FAULT GENERATE:' + rst_dir)


# cpp 对应 python api
# 用于存储 api 的名称, 用于后续生成对应python api文档链接
def cpp2py(data: dict):
    """Collect the C++ API names that have a matching Python API.

    Iterates ``data["using"]`` and removes every ``"paddle::"`` substring
    from each entry. The resulting names are used later to link the
    generated C++ docs to the corresponding Python API docs.

    Args:
        data: Parsed header dict containing a ``"using"`` entry.

    Returns:
        A new list with the namespace-stripped names.
    """
    return [alias.replace("paddle::", "") for alias in data["using"]]


# 运行主函数,主要流程如下
# 1. 确定生成的目录
# 2. 提取待生成文档的PADDLE_API list
# 3. 生成文档
if __name__ == "__main__":
    # Step 1: decide which directory to scan and where to write.
    root_dir = ''
    save_dir = '.'  # default: write into the current directory
    if len(sys.argv) == 3:
        root_dir = sys.argv[1]
        save_dir = sys.argv[2]

    if root_dir == '':
        try:
            import paddle
            import inspect

            # Scan the installed paddle package when no source dir is given.
            root_dir = os.path.dirname(inspect.getsourcefile(paddle))
        except Exception:
            # paddle is missing or broken: fall back to a relative checkout.
            # ``Exception`` rather than a bare except, so that Ctrl-C during
            # the import is not silently turned into the fallback.
            root_dir = '../paddle'
            save_dir = '.'  # default: write into the current directory

    # Step 2: collect every PADDLE_API function and class under root_dir.
    all_funcs = []
    all_class = []
    cpp2py_api_list = []
    overview_list = []
    for home, dirs, files in os.walk(root_dir):
        for file_name in files:
            # Only C++/CUDA sources and headers are of interest.
            if file_name.split(".")[-1] not in ["cc", "cu", "h"]:
                continue

            file_path = os.path.join(home, file_name)
            # tensor_compat.h is currently the only file whose C++ APIs map
            # one-to-one onto Python APIs.
            if file_name == "tensor_compat.h":
                cpp2py_data = analysis_file(file_path)
                cpp2py_api_list = cpp2py(cpp2py_data)

            # Cheap pre-filter: skip files that never mention the macro.
            with open(file_path, encoding='utf-8') as f:
                if 'PADDLE_API ' not in f.read():
                    continue

            print("Parsing: ", file_path)
            data = analysis_file(file_path)

            # Extract the exported declarations from the parsed header.
            current_func = get_PADDLE_API_func(data)
            current_class = get_PADDLE_API_class(data)

            # Record them, both flat and grouped per header for the overview.
            all_funcs.extend(current_func)
            all_class.extend(current_class)
            overview_list.append(
                {
                    'h_file': file_path,
                    'class': current_class,
                    'function': current_func,
                }
            )

    # Step 3: generate the per-API documents in both languages.
    generate_docs(all_funcs, all_class, cpp2py_api_list, save_dir, "cn")
    generate_docs(all_funcs, all_class, cpp2py_api_list, save_dir, "en")

    # Generate the overview pages.
    generate_overview(overview_list, save_dir, "cn")
    generate_overview(overview_list, save_dir, "en")

    # Summary statistics.
    print("PADDLE_API func count: ", len(all_funcs))
    print("PADDLE_API class count: ", len(all_class))
    print("cpp2py api count: ", len(cpp2py_api_list))
2 changes: 2 additions & 0 deletions ci_scripts/CAPItools/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
robotpy-cppheaderparser==5.1.0
# paddle
94 changes: 94 additions & 0 deletions ci_scripts/CAPItools/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# 获取存在 PADDLE_API func 数组的名称
# CppHeaderParser 解析后以字典形式保存数据,'debug' 字段中保存了原始信息
# 如果 PADDLE_API 在字段中,则表明该 API 是外部暴露的函数
def get_PADDLE_API_func(data: dict):
    """Return the parsed functions that are marked with ``PADDLE_API``.

    Each entry of ``data["functions"]`` carries the original declaration
    text under ``'debug'``; a function is kept when that text contains
    ``PADDLE_API``, i.e. it is exported.

    Args:
        data: Parsed header dict containing a ``"functions"`` list.

    Returns:
        A new list holding the matching function dicts, in input order.
    """
    return [func for func in data["functions"] if 'PADDLE_API' in func['debug']]


# 获取存在 PADDLE_API class 数组的名称
# CppHeaderParser 解析后以字典形式保存数据
# 如果 PADDLE_API 在字段中,则表明该 API 是外部暴露的类
def get_PADDLE_API_class(data: dict):
    """Return the parsed classes that are marked with ``PADDLE_API``.

    ``data["classes"]`` maps class names to their parsed description.
    Entries declared as ``struct`` are ignored (the original author notes
    that no struct is currently exported). Because the parser glues the
    macro onto the class name (e.g. ``PADDLE_APIDeviceContextPool``), a
    class counts as exported when its name contains ``PADDLE_API``.

    Args:
        data: Parsed header dict containing a ``"classes"`` mapping.

    Returns:
        A new list holding the matching class dicts, in iteration order.
    """
    classes = data["classes"]
    return [
        classes[name]
        for name in classes
        if classes[name]["declaration_method"] != "struct"
        and "PADDLE_API" in name
    ]


# 获取方法中的参数parameters
# 根据解析的参数字典,添加对应的参数名、参数类型、说明
# 有时候会将“&”解析为参数名,需要特殊处理
def get_parameters(parameters):
    """Build a ``name -> {'type', 'intro'}`` mapping from parsed parameters.

    The type string is normalised: the ``" &"`` / ``" *"`` tokens are
    removed from the parsed type, then one ``&`` per reference level and one
    ``*`` per pointer level are appended, and ``const `` is prepended when
    the parameter is flagged constant but the type does not already start
    with ``const``.

    Example:
        before: [{'name': 'x', 'type': 'const Tensor &', 'reference': 1,
                  'pointer': 0, 'constant': 1, 'desc': 'input'}]
        after:  {'x': {'type': 'const Tensor&', 'intro': 'input'}}

    Args:
        parameters: Iterable of parameter dicts with the keys ``'name'``,
            ``'type'``, ``'reference'``, ``'pointer'``, ``'constant'`` and
            optionally ``'desc'``.

    Returns:
        A dict keyed by parameter name. Parameters whose name is ``'&'``
        are left out.
    """
    parameter_dict = {}
    for param in parameters:
        # The parser sometimes reports a lone "&" as the name of an unnamed
        # parameter; such entries are skipped.
        if param['name'] == '&':
            continue

        param_type = param['type'].replace(" &", "").replace(" *", "")
        # Re-attach the markers without the separating space. The pointer
        # level is treated as a count, like the reference level, so that
        # multi-level pointers keep all of their stars.
        param_type += "&" * param["reference"]
        param_type += "*" * param["pointer"]
        if param["constant"] == 1 and not param_type.startswith('const'):
            param_type = "const " + param_type

        # NOTE(review): as written this removes every space from the
        # description; confirm whether only repeated spaces were meant.
        desc = param.get('desc', '').replace(' ', '')

        parameter_dict[param['name']] = {
            'type': param_type,
            'intro': desc,
        }
    return parameter_dict


# 将注释内容解析为说明字典
# 解析前: @brief Construct a Tensor from a buffer pointed to by `data` @note `from_blob` doesn’t copy or move data, Modifying the constructed tensor is equivalent to modifying the original data. @param data The pointer to the memory buffer. @param shape The dims of the tensor. @param dtype The data type of the tensor, should correspond to data type of`data`. See PD_FOR_EACH_DATA_TYPE in `phi/common/data_type.h` @param layout The data layout of the tensor. @param place The place where the tensor is located.If `place` is default value, it will be inferred from `data`,However, the feature is only supported on CPU or GPU.If `place` is not default value, make sure that `place` is equalto the place of `data` @param deleter A function or function object that will be called to free thememory buffer. @return A Tensor object constructed from the buffer
# 以@作为分隔符,索引关键字包括'brief'、'note'、'return'、'param'
# 解析后分别将对应关键字后的内容放入字典对应关键字后
def parse_doxygen(doxygen):
    """Split a flattened doxygen comment into its tagged sections.

    Everything before the first ``@`` is discarded, the rest is split on
    ``@`` and each piece is dispatched on its leading keyword. Only
    ``brief``, ``return``, ``param`` and ``note`` (each followed by a
    single space) are recognised; other tags are ignored. If a tag other
    than ``param`` occurs more than once, the last occurrence wins.

    Example:
        before: "@brief Add. @param x First. @return Sum"
        after:  {'intro': 'Add. ', 'returns': 'Sum',
                 'param_intro': {'x': 'First. '}, 'note': ''}

    Args:
        doxygen: The comment text.

    Returns:
        A dict with the keys ``'intro'``, ``'returns'``, ``'note'``
        (strings) and ``'param_intro'`` (parameter name -> description).
    """
    doxygen_dict = {
        'intro': '',
        'returns': '',
        'param_intro': {},
        'note': '',
    }

    if '@' in doxygen:
        doxygen = doxygen[doxygen.find('@') :]

    for doxygen_part in doxygen.split('@'):
        if doxygen_part.startswith('brief '):
            doxygen_dict['intro'] = doxygen_part.replace('brief ', '', 1)
        elif doxygen_part.startswith('return '):
            doxygen_dict['returns'] = doxygen_part.replace('return ', '', 1)
        elif doxygen_part.startswith('param '):
            param_intro = doxygen_part.replace('param ', '', 1)
            # partition() copes with a parameter that has no description;
            # slicing on find(' ') == -1 would cut the last character off
            # the name and reuse the name as its description.
            param_name, _, param_text = param_intro.partition(' ')
            doxygen_dict['param_intro'][param_name] = param_text
        elif doxygen_part.startswith('note '):
            doxygen_dict['note'] = doxygen_part.replace('note ', '', 1)

    return doxygen_dict
Loading