Refactor document parser to include more corner cases (#13175)

iyabchen · web-flow · commit 78a3a2a4d576 · 2025-03-06T22:20:15.000Z
diff --git a/tools/diff-processor/detector/detector.go b/tools/diff-processor/detector/detector.go
@@ -185,13 +185,7 @@ func DetectMissingDocs(schemaDiff diff.SchemaDiff, repoPath string) (map[string]
 				return nil, fmt.Errorf("failed to parse document %s: %w", docFilePath, err)
 			}
 
-			argumentsInDoc := listToMap(parser.Arguments())
-			attributesInDoc := listToMap(parser.Attributes())
-			for _, m := range []map[string]bool{argumentsInDoc, attributesInDoc} {
-				for k, v := range m {
-					fieldsInDoc[k] = v
-				}
-			}
+			fieldsInDoc = listToMap(parser.FlattenFields())
 			// for iam resource
 			if v, ok := fieldsInDoc["member/members"]; ok {
 				fieldsInDoc["member"] = v
@@ -203,6 +197,10 @@ func DetectMissingDocs(schemaDiff diff.SchemaDiff, repoPath string) (map[string]
 			if !isNewField(fieldDiff) {
 				continue
 			}
+			// skip condition field, check mmv1/templates/terraform/resource_iam.html.markdown.tmpl for IamConditionsRequestType
+			if field == "condition" || strings.HasPrefix(field, "condition.") {
+				continue
+			}
 			if !fieldsInDoc[field] {
 				newFields = append(newFields, field)
 			}
@@ -254,10 +252,10 @@ func resourceToDocFile(resource string, repoPath string) (string, error) {
 		strings.TrimPrefix(resource, "google_") + ".html.markdown",
 		resource + ".html.markdown",
 	}
-	suffix := []string{"_policy", "_binding", "_member"}
+	suffix := []string{"_iam_policy", "_iam_binding", "_iam_member", "_iam_audit_config"}
 	for _, s := range suffix {
-		if strings.HasSuffix(resource, "_iam"+s) {
-			iamName := strings.TrimSuffix(resource, s)
+		if strings.HasSuffix(resource, s) {
+			iamName := strings.TrimSuffix(resource, s) + "_iam"
 			baseNameOptions = append(baseNameOptions, iamName+".html.markdown")
 			baseNameOptions = append(baseNameOptions, strings.TrimPrefix(iamName, "google_")+".html.markdown")
 		}
@@ -277,8 +275,8 @@ func dataSourceToDocFile(resource string, repoPath string) (string, error) {
 		strings.TrimPrefix(resource, "google_"),
 		resource,
 	}
-	// There are only iam_policy files, no iam_binding, iam_member.
-	suffix := []string{"_iam_binding", "_iam_member"}
+	// There are only iam_policy files, no iam_binding, iam_member, iam_audit_config.
+	suffix := []string{"_iam_binding", "_iam_member", "iam_audit_config"}
 	for _, s := range suffix {
 		if strings.HasSuffix(resource, s) {
 			iamName := strings.ReplaceAll(resource, s, "_iam_policy")
diff --git a/tools/diff-processor/documentparser/document_parser.go b/tools/diff-processor/documentparser/document_parser.go
@@ -7,21 +7,19 @@ import (
 	"strings"
 )
 
-const (
-	nestedNamePattern = `\(#(nested_[a-z0-9_]+)\)`
+var (
+	fieldNameRegex      = regexp.MustCompile("[\\*|-]\\s+`([a-z0-9_\\./]+)`") // * `xxx` or - `xxx`
+	nestedObjectRegex   = regexp.MustCompile(`<a\s+name="([a-z0-9_]+)">`)     // <a name="xxx">
+	nestedHashTagRegex  = regexp.MustCompile(`\(#(nested_[a-z0-9_]+)\)`)      // (#nested_xxx)
+	horizontalLineRegex = regexp.MustCompile("- - -|-{10,}")                  // - - - or ------------
 
-	itemNamePattern   = "\\* `([a-z0-9_\\./]+)`"
-	nestedLinkPattern = `<a\s+name="([a-z0-9_]+)">`
-
-	sectionSeparator      = "## "
-	nestedObjectSeparator = `<a name="nested_`
-	listItemSeparator     = "* `"
+	sectionSeparator = "## "
 )
 
 // DocumentParser parse *.html.markdown resource doc files.
 type DocumentParser struct {
-	argumentRoot   *node
-	attriibuteRoot *node
+	root        *node
+	nestedBlock map[string]string
 }
 
 type node struct {
@@ -31,15 +29,17 @@ type node struct {
 }
 
 func NewParser() *DocumentParser {
-	return &DocumentParser{}
+	return &DocumentParser{
+		nestedBlock: make(map[string]string),
+	}
 }
 
-func (d *DocumentParser) Arguments() []string {
+func (d *DocumentParser) FlattenFields() []string {
 	var paths []string
 	traverse(
 		&paths,
 		"",
-		d.argumentRoot,
+		d.root,
 	)
 	sort.Strings(paths)
 	return paths
@@ -63,17 +63,6 @@ func traverse(paths *[]string, path string, n *node) {
 	}
 }
 
-func (d *DocumentParser) Attributes() []string {
-	var paths []string
-	traverse(
-		&paths,
-		"",
-		d.attriibuteRoot,
-	)
-	sort.Strings(paths)
-	return paths
-}
-
 // Parse parse a resource document markdown's arguments and attributes section.
 // The parsed file format is defined in mmv1/templates/terraform/resource.html.markdown.tmpl.
 func (d *DocumentParser) Parse(src []byte) error {
@@ -86,51 +75,43 @@ func (d *DocumentParser) Parse(src []byte) error {
 			argument = p
 		}
 	}
-	if len(argument) != 0 {
-		argumentParts := strings.Split(argument, "- - -")
-		for _, part := range argumentParts {
-			n, err := d.parseSection(part)
-			if err != nil {
+	for _, text := range []string{argument, attribute} {
+		if len(text) != 0 {
+			sections := horizontalLineRegex.Split(text, -1)
+			var allTopLevelFieldSections string
+			for _, part := range sections {
+				topLevelPropertySection, err := d.extractNestedObject(part)
+				if err != nil {
+					return err
+				}
+				allTopLevelFieldSections += topLevelPropertySection
+			}
+			root := &node{
+				text: allTopLevelFieldSections,
+			}
+			if err := d.bfs(root, d.nestedBlock); err != nil {
 				return err
 			}
-			if d.argumentRoot == nil {
-				d.argumentRoot = n
+			if d.root == nil {
+				d.root = root
 			} else {
-				d.argumentRoot.children = append(d.argumentRoot.children, n.children...)
+				d.root.children = append(d.root.children, root.children...)
 			}
 		}
 	}
-	if len(attribute) != 0 {
-		n, err := d.parseSection(attribute)
-		if err != nil {
-			return err
-		}
-		d.attriibuteRoot = n
-	}
 	return nil
 }
 
-func (d *DocumentParser) parseSection(input string) (*node, error) {
-	parts := strings.Split(input, "\n"+nestedObjectSeparator)
-	nestedBlock := make(map[string]string)
+func (d *DocumentParser) extractNestedObject(input string) (string, error) {
+	parts := splitWithRegexp(input, nestedObjectRegex)
 	for _, p := range parts[1:] {
-		nestedName, err := findPattern(nestedObjectSeparator+p, nestedLinkPattern)
-		if err != nil {
-			return nil, err
-		}
+		nestedName := findPattern(p, nestedObjectRegex)
 		if nestedName == "" {
-			return nil, fmt.Errorf("could not find nested object name in %s", nestedObjectSeparator+p)
+			return "", fmt.Errorf("could not find nested object name in %s", p)
 		}
-		nestedBlock[nestedName] = p
-	}
-	// bfs to traverse the first part without nested blocks.
-	root := &node{
-		text: parts[0],
+		d.nestedBlock[nestedName] = p
 	}
-	if err := d.bfs(root, nestedBlock); err != nil {
-		return nil, err
-	}
-	return root, nil
+	return parts[0], nil
 }
 
 func (d *DocumentParser) bfs(root *node, nestedBlock map[string]string) error {
@@ -143,24 +124,22 @@ func (d *DocumentParser) bfs(root *node, nestedBlock map[string]string) error {
 		l := len(queue)
 		for _, cur := range queue {
 			// the separator should always at the beginning of the line
-			items := strings.Split(cur.text, "\n"+listItemSeparator)
-			for _, item := range items[1:] {
-				text := listItemSeparator + item
-				itemName, err := findItemName(text)
-				if err != nil {
-					return err
+			parts := splitWithRegexp(cur.text, fieldNameRegex)
+			for _, p := range parts[1:] {
+				p = strings.ReplaceAll(p, "\n", "")
+				fieldName := findPattern(p, fieldNameRegex)
+				if fieldName == "" {
+					return fmt.Errorf("could not find field name in %s", p)
 				}
 				// There is a special case in some hand written resource eg. in compute_instance, where its attributes is in a.0.b.0.c format.
-				itemName = strings.ReplaceAll(itemName, ".0.", ".")
-				nestedName, err := findNestedName(text)
-				if err != nil {
-					return err
-				}
+				fieldName = strings.ReplaceAll(fieldName, ".0.", ".")
 				newNode := &node{
-					name: itemName,
+					name: fieldName,
 				}
 				cur.children = append(cur.children, newNode)
-				if text, ok := nestedBlock[nestedName]; ok {
+
+				nestedHashTag := findPattern(p, nestedHashTagRegex)
+				if text, ok := nestedBlock[nestedHashTag]; ok {
 					newNode.text = text
 					queue = append(queue, newNode)
 				}
@@ -172,31 +151,27 @@ func (d *DocumentParser) bfs(root *node, nestedBlock map[string]string) error {
 	return nil
 }
 
-func findItemName(text string) (name string, err error) {
-	name, err = findPattern(text, itemNamePattern)
-	if err != nil {
-		return "", err
-	}
-	if name == "" {
-		return "", fmt.Errorf("cannot find item name from %s", text)
+func findPattern(text string, re *regexp.Regexp) string {
+	match := re.FindStringSubmatch(text)
+	if match != nil {
+		return match[1]
 	}
-	return
+	return ""
 }
 
-func findPattern(text string, pattern string) (string, error) {
-	re, err := regexp.Compile(pattern)
-	if err != nil {
-		return "", err
+func splitWithRegexp(text string, re *regexp.Regexp) []string {
+	matches := re.FindAllStringIndex(text, -1)
+	if len(matches) == 0 {
+		return []string{text}
 	}
-	match := re.FindStringSubmatch(text)
+	var parts []string
+	start := 0
+	for _, match := range matches {
+		end := match[0]
 
-	if match != nil {
-		return match[1], nil
+		parts = append(parts, text[start:end])
+		start = end
 	}
-	return "", nil
-}
-
-func findNestedName(text string) (string, error) {
-	s := strings.ReplaceAll(text, "\n", "")
-	return findPattern(s, nestedNamePattern)
+	parts = append(parts, text[start:])
+	return parts
 }
diff --git a/tools/diff-processor/documentparser/document_parser_test.go b/tools/diff-processor/documentparser/document_parser_test.go
@@ -3,6 +3,7 @@ package documentparser
 import (
 	"os"
 	"sort"
+	"strings"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -17,7 +18,8 @@ func TestParse(t *testing.T) {
 	if err := parser.Parse(b); err != nil {
 		t.Fatal(err)
 	}
-	wantArguments := []string{
+	want := []string{
+		// The below are from arguments section.
 		"boot_disk",
 		"boot_disk.auto_delete",
 		"boot_disk.device_name",
@@ -57,6 +59,8 @@ func TestParse(t *testing.T) {
 		"network_interface.queue_count",
 		"network_interface.security_policy",
 		"network_interface.stack_type",
+		"network_interface.subnetwork",
+		"network_interface.subnetwork_project",
 		"params",
 		// "params.resource_manager_tags", // params text does not include a nested tag
 		"zone",
@@ -65,8 +69,7 @@ func TestParse(t *testing.T) {
 		"traffic_port_selector",
 		"traffic_port_selector.ports",
 		"project",
-	}
-	wantAttributes := []string{
+		// The below are from attributes section.
 		"id",
 		"network_interface.access_config.nat_ip",
 		"workload_identity_config",
@@ -76,17 +79,14 @@ func TestParse(t *testing.T) {
 		"workload_identity_config.workload_pool",
 		"errors.message",
 	}
-	gotArguments := parser.Arguments()
-	gotAttributes := parser.Attributes()
-	for _, arr := range [][]string{gotArguments, wantArguments, gotAttributes, wantAttributes} {
+	got := parser.FlattenFields()
+	// gotAttributes := parser.Attributes()
+	for _, arr := range [][]string{got, want} {
 		sort.Strings(arr)
 	}
-	if diff := cmp.Diff(wantArguments, gotArguments); diff != "" {
+	if diff := cmp.Diff(want, got); diff != "" {
 		t.Errorf("Parse returned diff in arguments(-want, +got): %s", diff)
 	}
-	if diff := cmp.Diff(wantAttributes, gotAttributes); diff != "" {
-		t.Errorf("Parse returned diff in attributes(-want, +got): %s", diff)
-	}
 }
 
 func TestTraverse(t *testing.T) {
@@ -114,3 +114,24 @@ func TestTraverse(t *testing.T) {
 		t.Errorf("traverse returned diff(-want, +got): %s", diff)
 	}
 }
+
+func TestSplitWithRegexp(t *testing.T) {
+	paragraph := []string{
+		"Lorem ipsum",
+		"*   `name` - (Required) Resource name.",
+		"",
+		"* `os_policies` - (Required) List of OS policies to be applied to the VMs. Structure is [documented below](#nested_os_policies).	",
+		"-   `some_field` - (Required) Lorem ipsum.	",
+	}
+
+	got := splitWithRegexp(strings.Join(paragraph, "\n"), fieldNameRegex)
+	want := []string{
+		"Lorem ipsum\n",
+		"*   `name` - (Required) Resource name.\n\n",
+		"* `os_policies` - (Required) List of OS policies to be applied to the VMs. Structure is [documented below](#nested_os_policies).	\n",
+		"-   `some_field` - (Required) Lorem ipsum.	",
+	}
+	if diff := cmp.Diff(want, got); diff != "" {
+		t.Errorf("splitWithRegexp returned diff(-want, +got): %s", diff)
+	}
+}
diff --git a/tools/diff-processor/testdata/resource.html.markdown b/tools/diff-processor/testdata/resource.html.markdown
@@ -210,7 +210,7 @@ specified, then this instance will have no external IPv6 Internet access. Struct
 - - -
 
 
-* `labels` -
+- `labels` -
   (Optional)
   Set of label tags associated with the TcpRoute resource.
   **Note**: This field is non-authoritative, and will only manage the labels present in your configuration.