Skip to content

Commit 9c20de3

Browse files
Fix text extract collisions (#362)
* don't add empty lines to the text annotations list
* use a heuristic char width and calculate the max line width up front
* prettier
* changeset
1 parent 0721318 commit 9c20de3

File tree

3 files changed

+86
-86
lines changed

3 files changed

+86
-86
lines changed

Diff for: .changeset/nervous-trees-study.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
reduce collisions and improve accuracy of textExtract

Diff for: lib/handlers/extractHandler.ts

+3-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,9 @@ export class StagehandExtractHandler {
247247
width: box.width,
248248
height: box.height,
249249
};
250-
allAnnotations.push(annotation);
250+
if (annotation.text.length > 0) {
251+
allAnnotations.push(annotation);
252+
}
251253
}
252254
}
253255

Diff for: lib/utils.ts

+78-85
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@ import { LogLine } from "../types/log";
33
import { TextAnnotation } from "../types/textannotation";
44
import { z } from "zod";
55

6+
// This is a heuristic for the width of a character in pixels. It seems to work
7+
// better than attempting to calculate character widths dynamically, which sometimes
8+
// results in collisions when placing characters on the "canvas".
9+
const HEURISTIC_CHAR_WIDTH = 5;
10+
611
export function generateId(operation: string) {
712
return crypto.createHash("sha256").update(operation).digest("hex");
813
}
@@ -26,164 +31,172 @@ export function formatText(
2631
textAnnotations: TextAnnotation[],
2732
pageWidth: number,
2833
): string {
29-
// **1:** Estimate the average character width in pixels by examining the text annotations.
30-
// If no reliable measurement is found, default to 10 pixels per character.
31-
const charWidth = estimateCharacterWidth(textAnnotations) || 10;
32-
33-
// **2:** Create a copy of textAnnotations and sort them by their vertical position (y-coordinate),
34-
// ensuring that topmost annotations appear first and bottommost appear last.
34+
// **1: Sort annotations by vertical position (y-coordinate).**
35+
// The topmost annotations appear first, the bottommost last.
3536
const sortedAnnotations = [...textAnnotations].sort(
3637
(a, b) => a.bottom_left.y - b.bottom_left.y,
3738
);
3839

39-
// **3:** Group annotations by their line position. We use a small epsilon to handle
40-
// floating-point differences. Two annotations are considered on the same line if their
41-
// y-coordinates differ by less than epsilon.
42-
const epsilon = 0.0001;
40+
// **2: Group annotations by line based on their y-coordinate.**
41+
// We use an epsilon so that very close y-values are treated as the same line.
42+
const epsilon = 1;
4343
const lineMap: Map<number, TextAnnotation[]> = new Map();
4444

4545
for (const annotation of sortedAnnotations) {
4646
let foundLineY: number | undefined;
47-
48-
// **4:** Check if the annotation belongs to an existing line group.
49-
// If so, add it to that line. Otherwise, start a new line group.
47+
// **3: Check if this annotation belongs to any existing line group.**
5048
for (const key of lineMap.keys()) {
5149
if (Math.abs(key - annotation.bottom_left.y) < epsilon) {
5250
foundLineY = key;
5351
break;
5452
}
5553
}
5654

55+
// If found, push into that line; otherwise, create a new line entry.
5756
if (foundLineY !== undefined) {
5857
lineMap.get(foundLineY)!.push(annotation);
5958
} else {
6059
lineMap.set(annotation.bottom_left.y, [annotation]);
6160
}
6261
}
6362

64-
// **5:** Extract all line keys (y-coordinates) and sort them to process lines top-to-bottom.
63+
// **4: Get all unique y-coordinates for lines and sort them top-to-bottom.**
6564
const lineYs = Array.from(lineMap.keys()).sort((a, b) => a - b);
6665

67-
// **6:** For each line, group words together and calculate the maximum normalized end position (maxNormalizedEndX).
68-
// This will help determine the necessary canvas width to accommodate all text.
69-
let maxNormalizedEndX = 0;
66+
// **5: Build an array of "final lines" (TextAnnotations[]) by grouping words for each line.**
7067
const finalLines: TextAnnotation[][] = [];
7168

7269
for (const lineY of lineYs) {
7370
const lineAnnotations = lineMap.get(lineY)!;
7471

75-
// **7:** Sort annotations in the current line by their horizontal position (x-coordinate),
76-
// ensuring left-to-right ordering.
72+
// **6: Sort annotations in the current line left-to-right by x-coordinate.**
7773
lineAnnotations.sort((a, b) => a.bottom_left.x - b.bottom_left.x);
7874

79-
// **8:** Group nearby annotations into word clusters, forming logical sentences or phrases.
75+
// **7: Group annotations into word clusters (sentences/phrases).**
8076
const groupedLineAnnotations = groupWordsInSentence(lineAnnotations);
8177

82-
// **9:** Determine how far to the right the text in this line extends, normalized by page width.
83-
// Update maxNormalizedEndX to track the widest line encountered.
84-
for (const ann of groupedLineAnnotations) {
85-
const textLengthInPx = ann.text.length * charWidth;
86-
const normalizedTextLength = textLengthInPx / pageWidth;
87-
const endX = ann.bottom_left_normalized.x + normalizedTextLength;
88-
if (endX > maxNormalizedEndX) {
89-
maxNormalizedEndX = endX;
78+
// **8: Push the grouped annotations for this line into finalLines.**
79+
finalLines.push(groupedLineAnnotations);
80+
}
81+
82+
// -------------------------
83+
// **First Pass**: Calculate the width of the longest line (in characters) up front.
84+
// We will use this to set the width of the canvas, which will reduce likelihood of collisions.
85+
// -------------------------
86+
let maxLineWidthInChars = 0;
87+
88+
for (const line of finalLines) {
89+
let lineMaxEnd = 0;
90+
for (const ann of line) {
91+
// Convert normalized X to character index
92+
const startXInChars = Math.round(
93+
ann.bottom_left_normalized.x * (pageWidth / HEURISTIC_CHAR_WIDTH),
94+
);
95+
// Each annotation spans ann.text.length characters
96+
const endXInChars = startXInChars + ann.text.length;
97+
98+
if (endXInChars > lineMaxEnd) {
99+
lineMaxEnd = endXInChars;
90100
}
91101
}
92-
93-
// **10:** Save the processed line to finalLines for later rendering.
94-
finalLines.push(groupedLineAnnotations);
102+
// Track the largest width across all lines
103+
if (lineMaxEnd > maxLineWidthInChars) {
104+
maxLineWidthInChars = lineMaxEnd;
105+
}
95106
}
96107

97-
// **11:** Determine the canvas width in characters. We scale according to maxNormalizedEndX and page width.
98-
// Add a small buffer (20 chars) to ensure no text overflows the canvas.
99-
let canvasWidth = Math.ceil(maxNormalizedEndX * (pageWidth / charWidth)) + 20;
100-
canvasWidth = Math.max(canvasWidth, 1);
108+
// **9: Add a 20-char buffer to ensure we don’t cut off text.**
109+
maxLineWidthInChars += 20;
101110

102-
// **12:** Compute the baseline (lowest point) of each line. This helps us understand vertical spacing.
111+
// **10: Determine the canvas width based on the measured maxLineWidthInChars.**
112+
const canvasWidth = Math.max(maxLineWidthInChars, 1);
113+
114+
// **11: Compute the baseline (lowest y) of each line to measure vertical spacing.**
103115
const lineBaselines = finalLines.map((line) =>
104116
Math.min(...line.map((a) => a.bottom_left.y)),
105117
);
106118

107-
// **13:** Compute vertical gaps between consecutive lines to determine line spacing.
119+
// **12: Compute the gaps between consecutive lines.**
108120
const verticalGaps: number[] = [];
109121
for (let i = 1; i < lineBaselines.length; i++) {
110122
verticalGaps.push(lineBaselines[i] - lineBaselines[i - 1]);
111123
}
112124

113-
// **14:** Estimate what a "normal" line spacing is by taking the median of all vertical gaps.
125+
// **13: Estimate a "normal" line spacing via the median of these gaps.**
114126
const normalLineSpacing = verticalGaps.length > 0 ? median(verticalGaps) : 0;
115127

116-
// **15:** Create a 2D character canvas initialized with spaces, onto which we'll "print" text lines.
128+
// **14: Create a 2D character canvas (array of arrays), filled with spaces.**
117129
let canvas: string[][] = [];
118130

119-
// **16:** lineIndex represents the current line of the canvas. Initialize with -1 so the first line starts at 0.
131+
// **15: lineIndex tracks which row of the canvas we’re on; start at -1 so the first line is index 0.**
120132
let lineIndex = -1;
121133

122-
// **17:** Iterate over each line of processed text.
134+
// **16: Render each line of text into our canvas.**
123135
for (let i = 0; i < finalLines.length; i++) {
124136
if (i === 0) {
125-
// **18:** For the first line, just increment lineIndex to start at 0 with no extra spacing.
137+
// **17: For the very first line, just increment lineIndex once.**
126138
lineIndex++;
127139
ensureLineExists(canvas, lineIndex, canvasWidth);
128140
} else {
129-
// **19:** For subsequent lines, calculate how many extra blank lines to insert based on spacing.
141+
// **18: For subsequent lines, figure out how many blank lines to insert
142+
// based on the gap between this line’s baseline and the previous line’s baseline.**
130143
const gap = lineBaselines[i] - lineBaselines[i - 1];
131144

132145
let extraLines = 0;
133-
// **20:** If we have a known normal line spacing, and the gap is larger than expected,
134-
// insert extra blank lines proportional to the ratio of gap to normal spacing.
135-
if (normalLineSpacing > 0) {
136-
if (gap > 1.2 * normalLineSpacing) {
137-
extraLines = Math.max(Math.round(gap / normalLineSpacing) - 1, 0);
138-
}
146+
// **19: If the gap is significantly larger than the "normal" spacing,
147+
// insert blank lines proportionally.**
148+
if (normalLineSpacing > 0 && gap > 1.2 * normalLineSpacing) {
149+
extraLines = Math.max(Math.round(gap / normalLineSpacing) - 1, 0);
139150
}
140151

141-
// **21:** Insert the calculated extra blank lines to maintain approximate vertical spacing.
152+
// **20: Insert the calculated extra blank lines.**
142153
for (let e = 0; e < extraLines; e++) {
143154
lineIndex++;
144155
ensureLineExists(canvas, lineIndex, canvasWidth);
145156
}
146157

147-
// **22:** After adjusting for spacing, increment lineIndex for the current line of text.
158+
// **21: Move to the next line (row) in the canvas for this line’s text.**
148159
lineIndex++;
149160
ensureLineExists(canvas, lineIndex, canvasWidth);
150161
}
151162

152-
// **23:** Now place the annotations for the current line onto the canvas at the appropriate horizontal positions.
163+
// **22: Place each annotation’s text in the correct horizontal position for this line.**
153164
const lineAnnotations = finalLines[i];
154165
for (const annotation of lineAnnotations) {
155166
const text = annotation.text;
156-
// **24:** Calculate the starting x-position in the canvas based on normalized coordinates.
167+
168+
// **23: Calculate the starting x-position in the canvas by converting normalized x to char space.**
157169
const startXInChars = Math.round(
158-
annotation.bottom_left_normalized.x * canvasWidth,
170+
annotation.bottom_left_normalized.x *
171+
(pageWidth / HEURISTIC_CHAR_WIDTH),
159172
);
160173

161-
// **25:** Place each character of the annotation text into the canvas.
174+
// **24: Place each character of the annotation in the canvas.**
162175
for (let j = 0; j < text.length; j++) {
163176
const xPos = startXInChars + j;
164-
// **26:** Ensure we don't exceed the canvas width.
177+
// **25: Don’t write beyond the right edge of the canvas.**
165178
if (xPos < canvasWidth) {
166179
canvas[lineIndex][xPos] = text[j];
167180
}
168181
}
169182
}
170183
}
171184

172-
// **27:** Trim trailing whitespace from each line to create a cleaner output.
185+
// **26: Trim trailing whitespace from each line to clean up the output.**
173186
canvas = canvas.map((row) => {
174187
const lineStr = row.join("");
175188
return Array.from(lineStr.trimEnd());
176189
});
177190

178-
// **29:** Join all lines to form the final page text. Trim any trailing whitespace from the entire text.
191+
// **27: Combine all rows into a single string, separating rows with newlines.**
179192
let pageText = canvas.map((line) => line.join("")).join("\n");
180193
pageText = pageText.trimEnd();
181194

182-
// **30:** Surround the page text with lines of dashes to clearly delineate the text block.
195+
// **28: Surround the rendered text with lines of dashes for clarity.**
183196
pageText =
184197
"-".repeat(canvasWidth) + "\n" + pageText + "\n" + "-".repeat(canvasWidth);
185198

186-
// **31:** Return the fully formatted text.
199+
// **29: Return the final formatted text.**
187200
return pageText;
188201
}
189202

@@ -208,28 +221,6 @@ function ensureLineExists(
208221
}
209222
}
210223

211-
/**
212-
* `estimateCharacterWidth` estimates the average character width (in pixels) from a collection of text annotations.
213-
* It calculates the width per character for each annotation and uses their median as the result.
214-
* If no annotations are available or they have zero-length text, returns 0.
215-
*
216-
* @param textAnnotations - An array of text annotations with text and width fields.
217-
* @returns The median character width in pixels, or 0 if none can be calculated.
218-
*/
219-
function estimateCharacterWidth(textAnnotations: TextAnnotation[]): number {
220-
// collect width-per-character measurements from each annotation
221-
const charWidths: number[] = [];
222-
for (const annotation of textAnnotations) {
223-
const length = annotation.text.length;
224-
if (length > 0) {
225-
charWidths.push(annotation.width / length);
226-
}
227-
}
228-
229-
// return the median of all collected measurements
230-
return median(charWidths);
231-
}
232-
233224
/**
234225
* `groupWordsInSentence` groups annotations within a single line into logical "words" or "sentences".
235226
* It uses a set of heuristics involving horizontal proximity and similar height
@@ -253,7 +244,7 @@ function groupWordsInSentence(
253244

254245
// determine horizontal grouping criteria
255246
// use a padding factor to allow slight spaces between words
256-
const padding = 2;
247+
const padding = 1;
257248
const lastAnn = currentGroup[currentGroup.length - 1];
258249
const characterWidth = (lastAnn.width / lastAnn.text.length) * padding;
259250
const isWithinHorizontalRange =
@@ -277,8 +268,10 @@ function groupWordsInSentence(
277268
// 3. start a new group with the current annotation
278269
if (currentGroup.length > 0) {
279270
const groupedAnnotation = createGroupedAnnotation(currentGroup);
280-
groupedAnnotations.push(groupedAnnotation);
281-
currentGroup = [annotation];
271+
if (groupedAnnotation.text.length > 0) {
272+
groupedAnnotations.push(groupedAnnotation);
273+
currentGroup = [annotation];
274+
}
282275
}
283276
}
284277
}

0 commit comments

Comments
 (0)