Skip to content

Commit b9f8874

Browse files
authored
build_generation: Fix invalid response handling and llm model handling (#1018)
This PR addresses two issues that impact the success rate of the Auto Discovery Build Generation Agent: 1. Occasionally, the LLM returns invalid results with incorrect tags for parsing. This PR introduces a new prompt to explain the situation to the LLM and instruct it to correct the response. 2. The LLM model was previously instantiated only once for all projects and trials, resulting in a bug where chat history from previous projects was incorrectly preserved and influenced subsequent LLM processes. This PR resolves the issue by reinitialising the LLM model for each project trial. Signed-off-by: Arthur Chan <[email protected]>
1 parent 5229eaf commit b9f8874

File tree

3 files changed

+60
-15
lines changed

3 files changed

+60
-15
lines changed

experimental/build_generator/llm_agent.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def __init__(self,
5151
self.build_files = {}
5252
self.last_status = False
5353
self.last_result = ''
54+
self.invalid = False
5455
self.target_files = {}
5556
self.discovery_stage = False
5657

@@ -88,6 +89,7 @@ def _container_handle_bash_commands(self, response: str, tool: BaseTool,
8889
# Initialise variables
8990
prompt_text = ''
9091
success = False
92+
self.invalid = False
9193

9294
# Retrieve data from response
9395
harness = self._parse_tag(response, 'fuzzer')
@@ -135,6 +137,8 @@ def _container_handle_bash_commands(self, response: str, tool: BaseTool,
135137

136138
if result.returncode == 0:
137139
success = True
140+
else:
141+
self.invalid = True
138142

139143
self.last_status = success
140144
self.last_result = prompt_text
@@ -149,6 +153,10 @@ def _container_handle_conclusion(self, cur_round: int, response: str,
149153
cur_round,
150154
trial=build_result.trial)
151155

156+
# Skip the conclusion checks when the result is already flagged invalid
157+
if self.invalid:
158+
return prompt
159+
152160
# Execution fail
153161
if not self.last_status:
154162
retry = templates.LLM_RETRY.replace('{BASH_RESULT}', self.last_result)
@@ -390,6 +398,7 @@ def _initial_prompt(self, results: list[Result]) -> Prompt: # pylint: disable=u
390398
problem = templates.LLM_AUTO_DISCOVERY
391399
problem = problem.replace('{PROJECT_NAME}', self.github_url.split('/')[-1])
392400
problem = problem.replace('{DOCKERFILE}', dockerfile_str)
401+
problem = problem.replace('{FUZZER}', self.harness_code)
393402
problem = problem.replace('{MAX_DISCOVERY_ROUND}', str(MAX_DISCOVERY_ROUND))
394403
problem = problem.replace('{FUZZING_FILE}',
395404
self.harness_path.split('/')[-1])
@@ -399,6 +408,18 @@ def _initial_prompt(self, results: list[Result]) -> Prompt: # pylint: disable=u
399408

400409
return prompt
401410

411+
def _container_handle_invalid_tool_usage(self, tool: BaseTool, cur_round: int,
412+
response: str,
413+
prompt: Prompt) -> Prompt:
414+
"""Formats a prompt to re-teach LLM how to use the |tool|."""
415+
logger.warning('ROUND %02d Invalid response from LLM: %s',
416+
cur_round,
417+
response,
418+
trial=self.trial)
419+
prompt.add_problem(templates.LLM_NO_VALID_TAG)
420+
421+
return prompt
422+
402423
def _container_tool_reaction(self, cur_round: int, response: str,
403424
build_result: BuildResult) -> Optional[Prompt]:
404425
"""Validates LLM conclusion or executes its command."""
@@ -421,7 +442,7 @@ def _container_tool_reaction(self, cur_round: int, response: str,
421442
if prompt is None:
422443
return None
423444

424-
if not response or not prompt or not prompt.get():
445+
if not response or not prompt.get() or self.invalid:
425446
prompt = self._container_handle_invalid_tool_usage(
426447
self.inspect_tool, cur_round, response, prompt)
427448

experimental/build_generator/runner.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -369,21 +369,10 @@ def run_agent(target_repositories: List[str], args: argparse.Namespace):
369369
# Prepare environment
370370
worker_project_name = get_next_worker_project(oss_fuzz_base)
371371

372-
# Prepare LLM model
373-
llm = models.LLM.setup(
374-
ai_binary=os.getenv('AI_BINARY', ''),
375-
name=args.model,
376-
max_tokens=4096,
377-
num_samples=1,
378-
temperature=0.4,
379-
temperature_list=[],
380-
)
381-
llm.MAX_INPUT_TOKEN = llm_agent.MAX_PROMPT_LENGTH
382-
383372
# All agents
384373
llm_agents = [
374+
llm_agent.AutoDiscoveryBuildScriptAgent,
385375
llm_agent.BuildSystemBuildScriptAgent,
386-
llm_agent.AutoDiscoveryBuildScriptAgent
387376
]
388377

389378
for target_repository in target_repositories:
@@ -399,6 +388,17 @@ def run_agent(target_repositories: List[str], args: argparse.Namespace):
399388
harness = ''
400389
build_success = False
401390
for trial in range(args.max_round):
391+
# Prepare new LLM model
392+
llm = models.LLM.setup(
393+
ai_binary=os.getenv('AI_BINARY', ''),
394+
name=args.model,
395+
max_tokens=4096,
396+
num_samples=1,
397+
temperature=0.4,
398+
temperature_list=[],
399+
)
400+
llm.MAX_INPUT_TOKEN = llm_agent.MAX_PROMPT_LENGTH
401+
402402
logger.info('Agent: %s. Round %d', llm_agent_ctr.__name__, trial)
403403
agent = llm_agent_ctr(trial=trial,
404404
llm=llm,

experimental/build_generator/templates.py

+26-2
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@
256256
257257
The project source code is located at `$SRC/{PROJECT_NAME}` inside a Docker container running **Ubuntu 24.04**.
258258
259-
The fuzzing harness template is provided at `$SRC/{FUZZING_FILE}`.
259+
The fuzzing harness template is provided at `$SRC/{FUZZING_FILE}` and its content is shown below; you must create your modified harness from this given template.
260260
261261
The generated build script will be executed in a **fresh session for testing**. Do **not** include any `|| true` or similar constructs to suppress errors.
262262
@@ -277,6 +277,14 @@
277277
</dockerfile>
278278
```
279279
280+
- **Template fuzzing harness:**
281+
```xml
282+
<fuzzer>
283+
{FUZZER}
284+
</fuzzer>
285+
```
286+
287+
280288
### Interaction Protocol
281289
282290
This is an **interactive process**. You do not initially know the project layout or build system. You must request commands to be run inside the Docker container to discover this information.
@@ -287,7 +295,7 @@
287295
288296
- `<command></command>` – Use to request shell commands that will be executed in the container.
289297
- `<bash></bash>` – Use only when ready to output the **final build script**.
290-
- `<fuzzer></fuzzer>` – Use to output the **modified fuzzing harness** with project-specific header includes added.
298+
- `<fuzzer></fuzzer>` – wraps the complete, modified fuzzing harness, which includes and links the binaries compiled from the target project. The result **MUST** contain the **entire source code** of the updated fuzzing harness, not just a diff or partial snippet.
291299
292300
You may include multiple shell commands in:
293301
- A single `<command>` tag, separated by semicolons (`;`), or
@@ -324,6 +332,7 @@
324332
- Only modify the provided harness by including headers from the target project as necessary.
325333
- Do **not** add any logic, templates, function calls, or placeholders.
326334
- The harness must remain syntactically valid and must compile and link cleanly with the generated static library.
335+
- The result **MUST** contain the **entire source code** of the updated fuzzing harness, not just a diff or partial snippet.
327336
328337
### Getting Started
329338
@@ -334,10 +343,25 @@
334343
- `find $SRC/{PROJECT_NAME} -type d -name include`
335344
336345
Your first reply should be a `<command>` block to start the investigation.
346+
And your last reply should return the full generated build script and the modified harness in the `<bash>` and `<fuzzer>` tags.
337347
'''
338348

339349
LLM_DOCKER_FEEDBACK = '''
340350
Here is the result of that command execution:
341351
342352
{RESULT}
343353
'''
354+
355+
LLM_NO_VALID_TAG = '''
356+
Your previous response is invalid.
357+
358+
To be valid, the response must meet the following requirements regarding XML tags:
359+
360+
- At least one of the following must be present:
361+
- One or more <command></command> tags containing valid shell commands.
362+
- A single <bash></bash> tag containing the complete Bash build script for compiling both the target project and the fuzzing harness.
363+
364+
- The <fuzzer></fuzzer> tag is **required only if** the fuzzing harness has been modified. If included, it must contain the **entire source code** of the updated fuzzing harness, not just a diff or partial snippet.
365+
366+
Do not include any content outside these XML tags. Revisit your output and regenerate it with these rules strictly followed.
367+
'''

0 commit comments

Comments
 (0)