Skip to content

Commit aa17d2b

Browse files
committed
Print as text if mostly text
The previous heuristic of treating a string as binary data if it contains any invalid UTF-8 was too strict. Loosen the heuristic to check whether most of the characters are printable text. Fixes #257
1 parent 8fa37b4 commit aa17d2b

File tree

3 files changed

+40
-15
lines changed

3 files changed

+40
-15
lines changed

cmp/compare_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,11 @@ using the AllowUnexported option.`, "\n"),
13021302
x: struct{ X interface{} }{[1]string{"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam sit amet pretium ligula, at gravida quam. Integer iaculis, velit at sagittis ultricies, lacus metus scelerisque turpis, ornare feugiat nulla nisl ac erat. Maecenas elementum ultricies libero, sed efficitur lacus molestie non. Nulla ac pretium dolor. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Pellentesque mi lorem, consectetur id porttitor id, sollicitudin sit amet enim. Duis eu dolor magna. Nunc ut augue turpis."}},
13031303
y: struct{ X interface{} }{[1]string{"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam sit amet pretium ligula, at gravida quam. Integer iaculis, velit at sagittis ultricies, lacus metus scelerisque turpis, ornare feugiat nulla nisl ac erat. Maecenas elementum ultricies libero, sed efficitur lacus molestie non. Nulla ac pretium dolor. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Pellentesque mi lorem, consectetur id porttitor id, sollicitudin sit amet enim. Duis eu dolor magna. Nunc ut augue turpis,"}},
13041304
reason: "printing a large standalone string that is different should print enough context to see the difference",
1305+
}, {
1306+
label: label + "/MostlyTextString",
1307+
x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,\xff=_value _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,\xff=_value _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,\xff=_value _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,\xff=_value _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c,\xff=_value _value=4 41\n",
1308+
y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\n",
1309+
reason: "the presence of a few invalid UTF-8 characters should not prevent printing this as text",
13051310
}}
13061311
}
13071312

cmp/report_slices.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -96,29 +96,28 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
9696
}
9797

9898
// Auto-detect the type of the data.
99-
var isLinedText, isText, isBinary bool
10099
var sx, sy string
100+
var isString, isMostlyText, isPureLinedText, isBinary bool
101101
switch {
102102
case t.Kind() == reflect.String:
103103
sx, sy = vx.String(), vy.String()
104-
isText = true // Initial estimate, verify later
104+
isString = true
105105
case t.Kind() == reflect.Slice && t.Elem() == reflect.TypeOf(byte(0)):
106106
sx, sy = string(vx.Bytes()), string(vy.Bytes())
107-
isBinary = true // Initial estimate, verify later
107+
isString = true
108108
case t.Kind() == reflect.Array:
109109
// Arrays need to be addressable for slice operations to work.
110110
vx2, vy2 := reflect.New(t).Elem(), reflect.New(t).Elem()
111111
vx2.Set(vx)
112112
vy2.Set(vy)
113113
vx, vy = vx2, vy2
114114
}
115-
if isText || isBinary {
116-
var numLines, lastLineIdx, maxLineLen int
117-
isBinary = !utf8.ValidString(sx) || !utf8.ValidString(sy)
115+
if isString {
116+
var numTotalRunes, numValidRunes, numLines, lastLineIdx, maxLineLen int
118117
for i, r := range sx + sy {
119-
if !(unicode.IsPrint(r) || unicode.IsSpace(r)) || r == utf8.RuneError {
120-
isBinary = true
121-
break
118+
numTotalRunes++
119+
if (unicode.IsPrint(r) || unicode.IsSpace(r)) && r != utf8.RuneError {
120+
numValidRunes++
122121
}
123122
if r == '\n' {
124123
if maxLineLen < i-lastLineIdx {
@@ -128,8 +127,10 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
128127
numLines++
129128
}
130129
}
131-
isText = !isBinary
132-
isLinedText = isText && numLines >= 4 && maxLineLen <= 1024
130+
isPureText := numValidRunes == numTotalRunes
131+
isMostlyText = float64(numValidRunes)/float64(numTotalRunes) > 0.95
132+
isPureLinedText = isPureText && numLines >= 4 && maxLineLen <= 1024
133+
isBinary = !isMostlyText
133134
}
134135

135136
// Format the string into printable records.
@@ -138,7 +139,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
138139
switch {
139140
// If the text appears to be multi-lined text,
140141
// then perform differencing across individual lines.
141-
case isLinedText:
142+
case isPureLinedText:
142143
ssx := strings.Split(sx, "\n")
143144
ssy := strings.Split(sy, "\n")
144145
list = opts.formatDiffSlice(
@@ -229,15 +230,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
229230
// If the text appears to be single-lined text,
230231
// then perform differencing in approximately fixed-sized chunks.
231232
// The output is printed as quoted strings.
232-
case isText:
233+
case isMostlyText:
233234
list = opts.formatDiffSlice(
234235
reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte",
235236
func(v reflect.Value, d diffMode) textRecord {
236237
s := formatString(v.String())
237238
return textRecord{Diff: d, Value: textLine(s)}
238239
},
239240
)
240-
delim = ""
241241

242242
// If the text appears to be binary data,
243243
// then perform differencing in approximately fixed-sized chunks.
@@ -299,7 +299,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
299299

300300
// Wrap the output with appropriate type information.
301301
var out textNode = &textWrap{Prefix: "{", Value: list, Suffix: "}"}
302-
if !isText {
302+
if !isMostlyText {
303303
// The "{...}" byte-sequence literal is not valid Go syntax for strings.
304304
// Emit the type for extra clarity (e.g. "string{...}").
305305
if t.Kind() == reflect.String {

cmp/testdata/diffs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,6 +1046,26 @@
10461046
+ },
10471047
}
10481048
>>> TestDiff/Reporter/LargeStandaloneString
1049+
<<< TestDiff/Reporter/MostlyTextString
1050+
strings.Join({
1051+
"org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa",
1052+
- ",\xff=_value _value=2 ",
1053+
+ " _value=2 ",
1054+
"11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb",
1055+
- ",\xff=_value _value=2 2",
1056+
+ " _value=2 2",
1057+
"1\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc",
1058+
- ",\xff=_value",
1059+
" _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
1060+
"=a,tag2",
1061+
- "=dd,\xff=_value",
1062+
+ "=dd",
1063+
" _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1",
1064+
- "=c,\xff=_value",
1065+
+ "=c",
1066+
" _value=4 41\n",
1067+
}, "")
1068+
>>> TestDiff/Reporter/MostlyTextString
10491069
<<< TestDiff/EmbeddedStruct/ParentStructA/Inequal
10501070
teststructs.ParentStructA{
10511071
privateStruct: teststructs.privateStruct{

0 commit comments

Comments
 (0)