build_generation: Fix invalid response handling and llm model handling (#1018)

arthurscchan · web-flow · commit b9f887433b79 · 2025-04-25T20:56:55.000+01:00
This PR addresses two issues that impact the success rate of the Auto
Discovery Build Generation Agent:

1. Occasionally, the LLM returns invalid results with incorrect tags for
parsing. This PR introduces a new prompt to explain the situation to the
LLM and instruct it to correct the response.
2. The LLM model was previously instantiated only once for all projects
and trials, resulting in a bug where chat history from previous projects
was incorrectly preserved and influenced subsequent LLM processes. This
PR resolves the issue by reinitialising the LLM model for each project
trial.

Signed-off-by: Arthur Chan <arthur.chan@adalogics.com>
diff --git a/experimental/build_generator/llm_agent.py b/experimental/build_generator/llm_agent.py
@@ -51,6 +51,7 @@ def __init__(self,
     self.build_files = {}
     self.last_status = False
     self.last_result = ''
+    self.invalid = False
     self.target_files = {}
     self.discovery_stage = False
 
@@ -88,6 +89,7 @@ def _container_handle_bash_commands(self, response: str, tool: BaseTool,
     # Initialise variables
     prompt_text = ''
     success = False
+    self.invalid = False
 
     # Retrieve data from response
     harness = self._parse_tag(response, 'fuzzer')
@@ -135,6 +137,8 @@ def _container_handle_bash_commands(self, response: str, tool: BaseTool,
 
       if result.returncode == 0:
         success = True
+    else:
+      self.invalid = True
 
     self.last_status = success
     self.last_result = prompt_text
@@ -149,6 +153,10 @@ def _container_handle_conclusion(self, cur_round: int, response: str,
                 cur_round,
                 trial=build_result.trial)
 
+    # Don't need to check for invalid result
+    if self.invalid:
+      return prompt
+
     # Execution fail
     if not self.last_status:
       retry = templates.LLM_RETRY.replace('{BASH_RESULT}', self.last_result)
@@ -390,6 +398,7 @@ def _initial_prompt(self, results: list[Result]) -> Prompt:  # pylint: disable=u
     problem = templates.LLM_AUTO_DISCOVERY
     problem = problem.replace('{PROJECT_NAME}', self.github_url.split('/')[-1])
     problem = problem.replace('{DOCKERFILE}', dockerfile_str)
+    problem = problem.replace('{FUZZER}', self.harness_code)
     problem = problem.replace('{MAX_DISCOVERY_ROUND}', str(MAX_DISCOVERY_ROUND))
     problem = problem.replace('{FUZZING_FILE}',
                               self.harness_path.split('/')[-1])
@@ -399,6 +408,18 @@ def _initial_prompt(self, results: list[Result]) -> Prompt:  # pylint: disable=u
 
     return prompt
 
+  def _container_handle_invalid_tool_usage(self, tool: BaseTool, cur_round: int,
+                                           response: str,
+                                           prompt: Prompt) -> Prompt:
+    """Formats a prompt to re-teach LLM how to use the |tool|."""
+    logger.warning('ROUND %02d Invalid response from LLM: %s',
+                   cur_round,
+                   response,
+                   trial=self.trial)
+    prompt.add_problem(templates.LLM_NO_VALID_TAG)
+
+    return prompt
+
   def _container_tool_reaction(self, cur_round: int, response: str,
                                build_result: BuildResult) -> Optional[Prompt]:
     """Validates LLM conclusion or executes its command."""
@@ -421,7 +442,7 @@ def _container_tool_reaction(self, cur_round: int, response: str,
         if prompt is None:
           return None
 
-    if not response or not prompt or not prompt.get():
+    if not response or not prompt.get() or self.invalid:
       prompt = self._container_handle_invalid_tool_usage(
           self.inspect_tool, cur_round, response, prompt)
 
diff --git a/experimental/build_generator/runner.py b/experimental/build_generator/runner.py
@@ -369,21 +369,10 @@ def run_agent(target_repositories: List[str], args: argparse.Namespace):
   # Prepare environment
   worker_project_name = get_next_worker_project(oss_fuzz_base)
 
-  # Prepare LLM model
-  llm = models.LLM.setup(
-      ai_binary=os.getenv('AI_BINARY', ''),
-      name=args.model,
-      max_tokens=4096,
-      num_samples=1,
-      temperature=0.4,
-      temperature_list=[],
-  )
-  llm.MAX_INPUT_TOKEN = llm_agent.MAX_PROMPT_LENGTH
-
   # All agents
   llm_agents = [
+      llm_agent.AutoDiscoveryBuildScriptAgent,
       llm_agent.BuildSystemBuildScriptAgent,
-      llm_agent.AutoDiscoveryBuildScriptAgent
   ]
 
   for target_repository in target_repositories:
@@ -399,6 +388,17 @@ def run_agent(target_repositories: List[str], args: argparse.Namespace):
       harness = ''
       build_success = False
       for trial in range(args.max_round):
+        # Prepare new LLM model
+        llm = models.LLM.setup(
+            ai_binary=os.getenv('AI_BINARY', ''),
+            name=args.model,
+            max_tokens=4096,
+            num_samples=1,
+            temperature=0.4,
+            temperature_list=[],
+        )
+        llm.MAX_INPUT_TOKEN = llm_agent.MAX_PROMPT_LENGTH
+
         logger.info('Agent: %s. Round %d', llm_agent_ctr.__name__, trial)
         agent = llm_agent_ctr(trial=trial,
                               llm=llm,
diff --git a/experimental/build_generator/templates.py b/experimental/build_generator/templates.py
@@ -256,7 +256,7 @@
 
 The project source code is located at `$SRC/{PROJECT_NAME}` inside a Docker container running **Ubuntu 24.04**.
 
-The fuzzing harness template is provided at `$SRC/{FUZZING_FILE}`.
+The fuzzing harness template is provided at `$SRC/{FUZZING_FILE}` and the content is shown below, which you must modified from the given template.
 
 The generated build script will be executed in a **fresh session for testing**. Do **not** include any `|| true` or similar constructs to suppress errors.
 
@@ -277,6 +277,14 @@
   </dockerfile>
   ```
 
+- **Template fuzzing harness:**
+  ```xml
+  <fuzzer>
+  {FUZZER}
+  </fuzzer>
+  ```
+
+
 ### Interaction Protocol
 
 This is an **interactive process**. You do not initially know the project layout or build system. You must request commands to be run inside the Docker container to discover this information.
@@ -287,7 +295,7 @@
 
 - `<command></command>` – Use to request shell commands that will be executed in the container.
 - `<bash></bash>` – Use only when ready to output the **final build script**.
-- `<fuzzer></fuzzer>` – Use to output the **modified fuzzing harness** with project-specific header includes added.
+- `<fuzzer></fuzzer>` – wraps the complete, modified fuzzing harness, which includes and links the binaries compiled from the target project. The result **MUST** contain the **entire source code** of the updated fuzzing harness, not just a diff or partial snippet.
 
 You may include multiple shell commands in:
 - A single `<command>` tag, separated by semicolons (`;`), or
@@ -324,6 +332,7 @@
 - Only modify the provided harness by including headers from the target project as necessary.
 - Do **not** add any logic, templates, function calls, or placeholders.
 - The harness must remain syntactically valid and must compile and link cleanly with the generated static library.
+- The result **MUST** contain the **entire source code** of the updated fuzzing harness, not just a diff or partial snippet.
 
 ### Getting Started
 
@@ -334,10 +343,25 @@
 - `find $SRC/{PROJECT_NAME} -type d -name include`
 
 Your first reply should be a `<command>` block to start the investigation.
+And your last reply should returns the full generated build script and modified harness with the `<bash>` and `<fuzzer>` tag.
 '''
 
 LLM_DOCKER_FEEDBACK = '''
 Here is the result of that command execution:
 
 {RESULT}
 '''
+
+LLM_NO_VALID_TAG = '''
+Your previous response is invalid.
+
+To be valid, the response must meet the following requirements regarding XML tags:
+
+- At least one of the following must be present:
+  - One or more <command></command> tags containing valid shell commands.
+  - A single <bash></bash> tag containing the complete Bash build script for compiling both the target project and the fuzzing harness.
+
+- The <fuzzer></fuzzer> tag is **required only if** the fuzzing harness has been modified. If included, it must contain the **entire source code** of the updated fuzzing harness, not just a diff or partial snippet.
+
+Do not include any content outside these XML tags. Revisit your output and regenerate it with these rules strictly followed.
+'''