Skip to content

feat(WIP): python parser #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7e80be5
fix: fix hardcode in export.go
Hoblovski Apr 30, 2025
ddca707
feat: cxx/c parser skeleton with clangd-18
Hoblovski Apr 30, 2025
5ba1764
feat: initial support for cxx collect
Hoblovski Apr 30, 2025
83847e7
fix: get json.Types and build graph
Hoblovski Apr 30, 2025
0ef54ac
fix: fix comments
Hoblovski May 9, 2025
fe8fb67
test: duplicate c functions
Hoblovski May 9, 2025
8d868e8
feat: python parser skeleton with custom pylsp
Hoblovski May 14, 2025
92dbe3b
feat: add tests for python parser
Hoblovski May 14, 2025
9ecefb5
XXX: temporarily disable unused implementation check
Hoblovski May 14, 2025
a541e5a
refactor: go fmt
Hoblovski May 15, 2025
3e88127
feat: initial support for python parse
Hoblovski May 15, 2025
7a7cd45
fix: entity types. fix calls/types for funcs
Hoblovski May 15, 2025
5c371ad
fix: manually parse to get correct Params
Hoblovski May 15, 2025
2c3f317
fix: uniast.file.Types now exclude imported types (and funcs too)
Hoblovski May 15, 2025
ab3fab4
feat: test for python parser method dependency
Hoblovski May 15, 2025
2a52a5e
feat: more tests for python parser
Hoblovski May 15, 2025
bcc7c36
fix: use most specific sym if multiple are possible in filterEntitySy…
Hoblovski May 15, 2025
4647d89
fix: typo
Hoblovski May 15, 2025
ca86757
feat: simple object test for rust
Hoblovski May 15, 2025
94b3445
feat: impl support and export __xx__ for python
Hoblovski May 15, 2025
537a2ed
fix: copyright headers
Hoblovski May 16, 2025
15c667d
feat: progress tracking for collect
Hoblovski May 16, 2025
709eb48
feat: cache semantic tokens with `-cache`
Hoblovski May 16, 2025
a471a24
feat: -veryverbose and -verbose for debug/info logs
Hoblovski May 16, 2025
24b2188
feat: parses flask
Hoblovski May 16, 2025
b3d6331
feat: ignore pystd if !NeedStdSymbols
Hoblovski May 20, 2025
bb35fec
feat: arg -indent to indent output json
Hoblovski May 20, 2025
ed6da83
feat: separate builtins from site-packages
Hoblovski May 20, 2025
fcb9b3a
test: global var in python
Hoblovski Jun 10, 2025
eb5d94e
fix(lsp): cache definition
Hoblovski Jun 10, 2025
795b36b
feat: optimized progress output
Hoblovski Jun 10, 2025
2d56a14
fix: cache countLines during export
Hoblovski Jun 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,6 @@ src/lang/testdata

tools
abcoder

/*.txt
/*.json
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ ABCoder currently supports the following languages:
| Go | ✅ | ✅ |
| Rust | ✅ | Coming Soon |
| C | Coming Soon | ❌ |
| Python | Coming Soon | ❌ |



Expand Down
113 changes: 92 additions & 21 deletions lang/collect/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@ import (
"strings"
"unicode"

"github.com/cloudwego/abcoder/lang/cxx"
"github.com/cloudwego/abcoder/lang/log"
. "github.com/cloudwego/abcoder/lang/lsp"
"github.com/cloudwego/abcoder/lang/python"
"github.com/cloudwego/abcoder/lang/rust"
"github.com/cloudwego/abcoder/lang/uniast"
)
Expand All @@ -35,8 +37,13 @@ type CollectOption struct {
NoNeedComment bool
NeedTest bool
Excludes []string
CacheResults bool
}

const (
SUPRESS_COLLECT_ERRORS = true
)

type Collector struct {
cli *LSPClient
spec LanguageSpec
Expand Down Expand Up @@ -79,6 +86,10 @@ func switchSpec(l uniast.Language) LanguageSpec {
switch l {
case uniast.Rust:
return &rust.RustSpec{}
case uniast.Cxx:
return &cxx.CxxSpec{}
case uniast.Python:
return &python.PythonSpec{}
default:
panic(fmt.Sprintf("unsupported language %s", l))
}
Expand All @@ -101,6 +112,24 @@ func NewCollector(repo string, cli *LSPClient) *Collector {
}

func (c *Collector) Collect(ctx context.Context) error {
// Example code to configure the LSP client
if !c.NeedStdSymbol {
if c.Language == uniast.Python {
conf := map[string]interface{}{
"settings": map[string]interface{}{
"pylsp": map[string]interface{}{
"plugins": map[string]interface{}{
"jedi_definition": map[string]interface{}{
"follow_builtin_definitions": false,
},
},
},
},
}
c.cli.Notify(ctx, "workspace/didChangeConfiguration", conf)
}
}

excludes := make([]string, len(c.Excludes))
for i, e := range c.Excludes {
if !filepath.IsAbs(e) {
Expand All @@ -111,8 +140,8 @@ func (c *Collector) Collect(ctx context.Context) error {
}

// scan all files
roots := make([]*DocumentSymbol, 0, 1024)
scanner := func(path string, info os.FileInfo, err error) error {
collect_paths := make([]string, 0, 1024)
if err := filepath.Walk(c.repo, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
Expand All @@ -127,20 +156,35 @@ func (c *Collector) Collect(ctx context.Context) error {
if c.spec.ShouldSkip(path) {
return nil
}
collect_paths = append(collect_paths, path)
return nil
}); err != nil {
return err
}

// collect symbols
// collect symbols
roots := make([]*DocumentSymbol, 0, 1024)
for i, path := range collect_paths {
uri := NewURI(path)
symbols, err := c.cli.DocumentSymbols(ctx, uri)
if err != nil {
return err
}
log.Info("collecting %d/%d files %s, has %d symbols\n", i, len(collect_paths), path, len(symbols))
// file := filepath.Base(path)
n_sym := 0
for _, sym := range symbols {
log.Debug(" collecting symbol %d/%d %s\n", n_sym, len(symbols), sym.Name)
n_sym++
// collect content
content, err := c.cli.Locate(sym.Location)
if err != nil {
return err
}
// HACK: skip imported symbols
if c.Language == uniast.Python && (strings.HasPrefix(content, "from ") || strings.HasPrefix(content, "import ")) {
continue
}
// collect tokens
tokens, err := c.cli.SemanticTokens(ctx, sym.Location)
if err != nil {
Expand All @@ -151,12 +195,8 @@ func (c *Collector) Collect(ctx context.Context) error {
c.syms[sym.Location] = sym
roots = append(roots, sym)
}

return nil
}
if err := filepath.Walk(c.repo, scanner); err != nil {
return err
}
log.Info("collected %d root symbols. going to collect more syms and dependencies...\n", len(roots))

// collect some extra metadata
syms := make([]*DocumentSymbol, 0, len(roots))
Expand All @@ -167,6 +207,7 @@ func (c *Collector) Collect(ctx context.Context) error {
}
c.processSymbol(ctx, sym, 1)
}
log.Info("collected %d symbols. going to collect dependencies...\n", len(c.syms))

// collect internal references
// for _, sym := range syms {
Expand Down Expand Up @@ -200,8 +241,11 @@ func (c *Collector) Collect(ctx context.Context) error {
// }
// }

num_edges := 0
// collect dependencies
for _, sym := range syms {
for i, sym := range syms {
log.Info("collecting dependencies %d/%d %s\n", i, len(syms), sym.Name)

next_token:

for i, token := range sym.Tokens {
Expand Down Expand Up @@ -247,7 +291,9 @@ func (c *Collector) Collect(ctx context.Context) error {
// go to definition
dep, err := c.getSymbolByToken(ctx, token)
if err != nil || dep == nil {
log.Error("dep token %v not found: %v\n", token, err)
if !SUPRESS_COLLECT_ERRORS {
log.Error("dep token %v not found: %v\n", token, err)
}
continue
}

Expand All @@ -268,6 +314,8 @@ func (c *Collector) Collect(ctx context.Context) error {
c.syms[dep.Location] = dep
}

log.Debug(" Collect: dep %s -> %s (file: %s -> %s)\n", sym.Name, dep.Name, sym.Location, token.Location)
num_edges++
c.deps[sym] = append(c.deps[sym], dependency{
Location: token.Location,
Symbol: dep,
Expand All @@ -276,6 +324,7 @@ func (c *Collector) Collect(ctx context.Context) error {
}
}

log.Info("collected %d symbols, %d edges.\n", len(c.syms), num_edges)
return nil
}

Expand All @@ -297,18 +346,31 @@ func (c *Collector) getSymbolByTokenWithLimit(ctx context.Context, tok Token, de
return nil, fmt.Errorf("definition of token %s not found", tok)
}
if len(defs) > 1 {
log.Error("definition of token %s not unique", tok)
if !SUPRESS_COLLECT_ERRORS {
log.Error("definition of token %s not unique", tok)
}
}
return c.getSymbolByLocation(ctx, defs[0], depth, tok)
}

func (c *Collector) filterEntitySymbols(syms []*DocumentSymbol) *DocumentSymbol {
// Choose the most specific symbol
var mostSpecific *DocumentSymbol
mostSpecific = nil
for _, sym := range syms {
if c.spec.IsEntitySymbol(*sym) {
return sym
if !c.spec.IsEntitySymbol(*sym) {
continue
}
if mostSpecific == nil || mostSpecific.Location.Include(sym.Location) {
// replace most specific
mostSpecific = sym
} else if sym.Location.Include(mostSpecific.Location) {
// retain most specific
} else {
log.Error("multiple symbols %s and %s not include each other", mostSpecific, sym)
}
}
return nil
return mostSpecific
}

// return a language entity symbol
Expand Down Expand Up @@ -488,7 +550,9 @@ func (c *Collector) getDepsWithLimit(ctx context.Context, sym *DocumentSymbol, t
for _, tp := range tps {
dep, err := c.getSymbolByTokenWithLimit(ctx, sym.Tokens[tp], depth)
if err != nil || sym == nil {
log.Error_skip(1, "token %v not found its symbol: %v", tp, err)
if !SUPRESS_COLLECT_ERRORS {
log.Error_skip(1, "token %v not found its symbol: %v", tp, err)
}
} else {
d := dependency{sym.Tokens[tp].Location, dep}
tsyms[tp] = d
Expand Down Expand Up @@ -525,7 +589,7 @@ func (c *Collector) collectImpl(ctx context.Context, sym *DocumentSymbol, depth
impl = ChunkHead(sym.Text, sym.Location.Range.Start, sym.Tokens[fn].Location.Range.Start)
}
if impl == "" || len(impl) < len(sym.Name) {
impl = sym.Name
impl = fmt.Sprintf("class %s {\n", sym.Name)
}
// search all methods
for _, method := range c.syms {
Expand Down Expand Up @@ -581,12 +645,16 @@ func (c *Collector) processSymbol(ctx context.Context, sym *DocumentSymbol, dept
}
}
if i < 0 || i >= len(sym.Tokens) {
log.Error("get type token of variable symbol %s failed\n", sym)
if !SUPRESS_COLLECT_ERRORS {
log.Error("get type token of variable symbol %s failed\n", sym)
}
return
}
tsym, err := c.getSymbolByTokenWithLimit(ctx, sym.Tokens[i], depth-1)
if err != nil || tsym == nil {
log.Error("get type symbol for token %s failed:%v\n", sym.Tokens[i], err)
if !SUPRESS_COLLECT_ERRORS {
log.Error("get type symbol for token %s failed:%v\n", sym.Tokens[i], err)
}
return
}
c.vars[sym] = dependency{
Expand All @@ -613,9 +681,12 @@ func (c *Collector) updateFunctionInfo(sym *DocumentSymbol, tsyms, ipsyms, opsym
}
} else {
f = functionInfo{
TypeParams: tsyms,
Inputs: ipsyms,
Outputs: opsyms,
TypeParams: tsyms,
Inputs: ipsyms,
Outputs: opsyms,
InputsSorted: is,
OutputsSorted: os,
TypeParamsSorted: ts,
}
if rsym != nil {
if f.Method == nil {
Expand Down
Loading