Skip to content

Commit 78a3a2a

Browse files
authored
Refactor document parser to include more corner cases (#13175)
1 parent fb6c60d commit 78a3a2a

File tree

4 files changed

+107
-113
lines changed

4 files changed

+107
-113
lines changed

tools/diff-processor/detector/detector.go

+10-12
Original file line numberDiff line numberDiff line change
@@ -185,13 +185,7 @@ func DetectMissingDocs(schemaDiff diff.SchemaDiff, repoPath string) (map[string]
185185
return nil, fmt.Errorf("failed to parse document %s: %w", docFilePath, err)
186186
}
187187

188-
argumentsInDoc := listToMap(parser.Arguments())
189-
attributesInDoc := listToMap(parser.Attributes())
190-
for _, m := range []map[string]bool{argumentsInDoc, attributesInDoc} {
191-
for k, v := range m {
192-
fieldsInDoc[k] = v
193-
}
194-
}
188+
fieldsInDoc = listToMap(parser.FlattenFields())
195189
// for iam resource
196190
if v, ok := fieldsInDoc["member/members"]; ok {
197191
fieldsInDoc["member"] = v
@@ -203,6 +197,10 @@ func DetectMissingDocs(schemaDiff diff.SchemaDiff, repoPath string) (map[string]
203197
if !isNewField(fieldDiff) {
204198
continue
205199
}
200+
// skip condition field, check mmv1/templates/terraform/resource_iam.html.markdown.tmpl for IamConditionsRequestType
201+
if field == "condition" || strings.HasPrefix(field, "condition.") {
202+
continue
203+
}
206204
if !fieldsInDoc[field] {
207205
newFields = append(newFields, field)
208206
}
@@ -254,10 +252,10 @@ func resourceToDocFile(resource string, repoPath string) (string, error) {
254252
strings.TrimPrefix(resource, "google_") + ".html.markdown",
255253
resource + ".html.markdown",
256254
}
257-
suffix := []string{"_policy", "_binding", "_member"}
255+
suffix := []string{"_iam_policy", "_iam_binding", "_iam_member", "_iam_audit_config"}
258256
for _, s := range suffix {
259-
if strings.HasSuffix(resource, "_iam"+s) {
260-
iamName := strings.TrimSuffix(resource, s)
257+
if strings.HasSuffix(resource, s) {
258+
iamName := strings.TrimSuffix(resource, s) + "_iam"
261259
baseNameOptions = append(baseNameOptions, iamName+".html.markdown")
262260
baseNameOptions = append(baseNameOptions, strings.TrimPrefix(iamName, "google_")+".html.markdown")
263261
}
@@ -277,8 +275,8 @@ func dataSourceToDocFile(resource string, repoPath string) (string, error) {
277275
strings.TrimPrefix(resource, "google_"),
278276
resource,
279277
}
280-
// There are only iam_policy files, no iam_binding, iam_member.
281-
suffix := []string{"_iam_binding", "_iam_member"}
278+
// There are only iam_policy files, no iam_binding, iam_member, iam_audit_config.
279+
suffix := []string{"_iam_binding", "_iam_member", "iam_audit_config"}
282280
for _, s := range suffix {
283281
if strings.HasSuffix(resource, s) {
284282
iamName := strings.ReplaceAll(resource, s, "_iam_policy")

tools/diff-processor/documentparser/document_parser.go

+65-90
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,19 @@ import (
77
"strings"
88
)
99

10-
const (
11-
nestedNamePattern = `\(#(nested_[a-z0-9_]+)\)`
10+
var (
11+
fieldNameRegex = regexp.MustCompile("[\\*|-]\\s+`([a-z0-9_\\./]+)`") // * `xxx`
12+
nestedObjectRegex = regexp.MustCompile(`<a\s+name="([a-z0-9_]+)">`) // <a name="xxx">
13+
nestedHashTagRegex = regexp.MustCompile(`\(#(nested_[a-z0-9_]+)\)`) // #(nested_xxx)
14+
horizontalLineRegex = regexp.MustCompile("- - -|-{10,}") // - - - or ------------
1215

13-
itemNamePattern = "\\* `([a-z0-9_\\./]+)`"
14-
nestedLinkPattern = `<a\s+name="([a-z0-9_]+)">`
15-
16-
sectionSeparator = "## "
17-
nestedObjectSeparator = `<a name="nested_`
18-
listItemSeparator = "* `"
16+
sectionSeparator = "## "
1917
)
2018

2119
// DocumentParser parse *.html.markdown resource doc files.
2220
type DocumentParser struct {
23-
argumentRoot *node
24-
attriibuteRoot *node
21+
root *node
22+
nestedBlock map[string]string
2523
}
2624

2725
type node struct {
@@ -31,15 +29,17 @@ type node struct {
3129
}
3230

3331
func NewParser() *DocumentParser {
34-
return &DocumentParser{}
32+
return &DocumentParser{
33+
nestedBlock: make(map[string]string),
34+
}
3535
}
3636

37-
func (d *DocumentParser) Arguments() []string {
37+
func (d *DocumentParser) FlattenFields() []string {
3838
var paths []string
3939
traverse(
4040
&paths,
4141
"",
42-
d.argumentRoot,
42+
d.root,
4343
)
4444
sort.Strings(paths)
4545
return paths
@@ -63,17 +63,6 @@ func traverse(paths *[]string, path string, n *node) {
6363
}
6464
}
6565

66-
func (d *DocumentParser) Attributes() []string {
67-
var paths []string
68-
traverse(
69-
&paths,
70-
"",
71-
d.attriibuteRoot,
72-
)
73-
sort.Strings(paths)
74-
return paths
75-
}
76-
7766
// Parse parse a resource document markdown's arguments and attributes section.
7867
// The parsed file format is defined in mmv1/templates/terraform/resource.html.markdown.tmpl.
7968
func (d *DocumentParser) Parse(src []byte) error {
@@ -86,51 +75,43 @@ func (d *DocumentParser) Parse(src []byte) error {
8675
argument = p
8776
}
8877
}
89-
if len(argument) != 0 {
90-
argumentParts := strings.Split(argument, "- - -")
91-
for _, part := range argumentParts {
92-
n, err := d.parseSection(part)
93-
if err != nil {
78+
for _, text := range []string{argument, attribute} {
79+
if len(text) != 0 {
80+
sections := horizontalLineRegex.Split(text, -1)
81+
var allTopLevelFieldSections string
82+
for _, part := range sections {
83+
topLevelPropertySection, err := d.extractNestedObject(part)
84+
if err != nil {
85+
return err
86+
}
87+
allTopLevelFieldSections += topLevelPropertySection
88+
}
89+
root := &node{
90+
text: allTopLevelFieldSections,
91+
}
92+
if err := d.bfs(root, d.nestedBlock); err != nil {
9493
return err
9594
}
96-
if d.argumentRoot == nil {
97-
d.argumentRoot = n
95+
if d.root == nil {
96+
d.root = root
9897
} else {
99-
d.argumentRoot.children = append(d.argumentRoot.children, n.children...)
98+
d.root.children = append(d.root.children, root.children...)
10099
}
101100
}
102101
}
103-
if len(attribute) != 0 {
104-
n, err := d.parseSection(attribute)
105-
if err != nil {
106-
return err
107-
}
108-
d.attriibuteRoot = n
109-
}
110102
return nil
111103
}
112104

113-
func (d *DocumentParser) parseSection(input string) (*node, error) {
114-
parts := strings.Split(input, "\n"+nestedObjectSeparator)
115-
nestedBlock := make(map[string]string)
105+
func (d *DocumentParser) extractNestedObject(input string) (string, error) {
106+
parts := splitWithRegexp(input, nestedObjectRegex)
116107
for _, p := range parts[1:] {
117-
nestedName, err := findPattern(nestedObjectSeparator+p, nestedLinkPattern)
118-
if err != nil {
119-
return nil, err
120-
}
108+
nestedName := findPattern(p, nestedObjectRegex)
121109
if nestedName == "" {
122-
return nil, fmt.Errorf("could not find nested object name in %s", nestedObjectSeparator+p)
110+
return "", fmt.Errorf("could not find nested object name in %s", p)
123111
}
124-
nestedBlock[nestedName] = p
125-
}
126-
// bfs to traverse the first part without nested blocks.
127-
root := &node{
128-
text: parts[0],
112+
d.nestedBlock[nestedName] = p
129113
}
130-
if err := d.bfs(root, nestedBlock); err != nil {
131-
return nil, err
132-
}
133-
return root, nil
114+
return parts[0], nil
134115
}
135116

136117
func (d *DocumentParser) bfs(root *node, nestedBlock map[string]string) error {
@@ -143,24 +124,22 @@ func (d *DocumentParser) bfs(root *node, nestedBlock map[string]string) error {
143124
l := len(queue)
144125
for _, cur := range queue {
145126
// the separator should always at the beginning of the line
146-
items := strings.Split(cur.text, "\n"+listItemSeparator)
147-
for _, item := range items[1:] {
148-
text := listItemSeparator + item
149-
itemName, err := findItemName(text)
150-
if err != nil {
151-
return err
127+
parts := splitWithRegexp(cur.text, fieldNameRegex)
128+
for _, p := range parts[1:] {
129+
p = strings.ReplaceAll(p, "\n", "")
130+
fieldName := findPattern(p, fieldNameRegex)
131+
if fieldName == "" {
132+
return fmt.Errorf("could not find field name in %s", p)
152133
}
153134
// There is a special case in some hand written resource eg. in compute_instance, where its attributes is in a.0.b.0.c format.
154-
itemName = strings.ReplaceAll(itemName, ".0.", ".")
155-
nestedName, err := findNestedName(text)
156-
if err != nil {
157-
return err
158-
}
135+
fieldName = strings.ReplaceAll(fieldName, ".0.", ".")
159136
newNode := &node{
160-
name: itemName,
137+
name: fieldName,
161138
}
162139
cur.children = append(cur.children, newNode)
163-
if text, ok := nestedBlock[nestedName]; ok {
140+
141+
nestedHashTag := findPattern(p, nestedHashTagRegex)
142+
if text, ok := nestedBlock[nestedHashTag]; ok {
164143
newNode.text = text
165144
queue = append(queue, newNode)
166145
}
@@ -172,31 +151,27 @@ func (d *DocumentParser) bfs(root *node, nestedBlock map[string]string) error {
172151
return nil
173152
}
174153

175-
func findItemName(text string) (name string, err error) {
176-
name, err = findPattern(text, itemNamePattern)
177-
if err != nil {
178-
return "", err
179-
}
180-
if name == "" {
181-
return "", fmt.Errorf("cannot find item name from %s", text)
154+
func findPattern(text string, re *regexp.Regexp) string {
155+
match := re.FindStringSubmatch(text)
156+
if match != nil {
157+
return match[1]
182158
}
183-
return
159+
return ""
184160
}
185161

186-
func findPattern(text string, pattern string) (string, error) {
187-
re, err := regexp.Compile(pattern)
188-
if err != nil {
189-
return "", err
162+
func splitWithRegexp(text string, re *regexp.Regexp) []string {
163+
matches := re.FindAllStringIndex(text, -1)
164+
if len(matches) == 0 {
165+
return []string{text}
190166
}
191-
match := re.FindStringSubmatch(text)
167+
var parts []string
168+
start := 0
169+
for _, match := range matches {
170+
end := match[0]
192171

193-
if match != nil {
194-
return match[1], nil
172+
parts = append(parts, text[start:end])
173+
start = end
195174
}
196-
return "", nil
197-
}
198-
199-
func findNestedName(text string) (string, error) {
200-
s := strings.ReplaceAll(text, "\n", "")
201-
return findPattern(s, nestedNamePattern)
175+
parts = append(parts, text[start:])
176+
return parts
202177
}

tools/diff-processor/documentparser/document_parser_test.go

+31-10
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package documentparser
33
import (
44
"os"
55
"sort"
6+
"strings"
67
"testing"
78

89
"github.com/google/go-cmp/cmp"
@@ -17,7 +18,8 @@ func TestParse(t *testing.T) {
1718
if err := parser.Parse(b); err != nil {
1819
t.Fatal(err)
1920
}
20-
wantArguments := []string{
21+
want := []string{
22+
// The below are from arguments section.
2123
"boot_disk",
2224
"boot_disk.auto_delete",
2325
"boot_disk.device_name",
@@ -57,6 +59,8 @@ func TestParse(t *testing.T) {
5759
"network_interface.queue_count",
5860
"network_interface.security_policy",
5961
"network_interface.stack_type",
62+
"network_interface.subnetwork",
63+
"network_interface.subnetwork_project",
6064
"params",
6165
// "params.resource_manager_tags", // params text does not include a nested tag
6266
"zone",
@@ -65,8 +69,7 @@ func TestParse(t *testing.T) {
6569
"traffic_port_selector",
6670
"traffic_port_selector.ports",
6771
"project",
68-
}
69-
wantAttributes := []string{
72+
// The below are from attributes section.
7073
"id",
7174
"network_interface.access_config.nat_ip",
7275
"workload_identity_config",
@@ -76,17 +79,14 @@ func TestParse(t *testing.T) {
7679
"workload_identity_config.workload_pool",
7780
"errors.message",
7881
}
79-
gotArguments := parser.Arguments()
80-
gotAttributes := parser.Attributes()
81-
for _, arr := range [][]string{gotArguments, wantArguments, gotAttributes, wantAttributes} {
82+
got := parser.FlattenFields()
83+
// gotAttributes := parser.Attributes()
84+
for _, arr := range [][]string{got, want} {
8285
sort.Strings(arr)
8386
}
84-
if diff := cmp.Diff(wantArguments, gotArguments); diff != "" {
87+
if diff := cmp.Diff(want, got); diff != "" {
8588
t.Errorf("Parse returned diff in arguments(-want, +got): %s", diff)
8689
}
87-
if diff := cmp.Diff(wantAttributes, gotAttributes); diff != "" {
88-
t.Errorf("Parse returned diff in attributes(-want, +got): %s", diff)
89-
}
9090
}
9191

9292
func TestTraverse(t *testing.T) {
@@ -114,3 +114,24 @@ func TestTraverse(t *testing.T) {
114114
t.Errorf("traverse returned diff(-want, +got): %s", diff)
115115
}
116116
}
117+
118+
func TestSplitWithRegexp(t *testing.T) {
119+
paragraph := []string{
120+
"Lorem ipsum",
121+
"* `name` - (Required) Resource name.",
122+
"",
123+
"* `os_policies` - (Required) List of OS policies to be applied to the VMs. Structure is [documented below](#nested_os_policies). ",
124+
"- `some_field` - (Required) Lorem ipsum. ",
125+
}
126+
127+
got := splitWithRegexp(strings.Join(paragraph, "\n"), fieldNameRegex)
128+
want := []string{
129+
"Lorem ipsum\n",
130+
"* `name` - (Required) Resource name.\n\n",
131+
"* `os_policies` - (Required) List of OS policies to be applied to the VMs. Structure is [documented below](#nested_os_policies). \n",
132+
"- `some_field` - (Required) Lorem ipsum. ",
133+
}
134+
if diff := cmp.Diff(want, got); diff != "" {
135+
t.Errorf("splitWithRegexp returned diff(-want, +got): %s", diff)
136+
}
137+
}

tools/diff-processor/testdata/resource.html.markdown

+1-1
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ specified, then this instance will have no external IPv6 Internet access. Struct
210210
- - -
211211

212212

213-
* `labels` -
213+
- `labels` -
214214
(Optional)
215215
Set of label tags associated with the TcpRoute resource.
216216
**Note**: This field is non-authoritative, and will only manage the labels present in your configuration.

0 commit comments

Comments
 (0)