Skip to content

Commit 9347810

Browse files
authored
build_generation: Add LLM approach (#979)
This PR initialises the LLM approach for automatic build script generation of C/C++ projects. This is done by adding a separate path for the new LLM agent. Passing the extra flag --agent triggers the new LLM agent flow for automatic build script generation. --------- Signed-off-by: Arthur Chan <[email protected]>
1 parent b0eda4b commit 9347810

File tree

6 files changed

+646
-108
lines changed

6 files changed

+646
-108
lines changed

experimental/build_generator/build_script_generator.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from typing import Dict, Iterator, List, Optional, Tuple
2424

2525
import constants
26-
import manager
26+
import file_utils as utils
2727

2828
logger = logging.getLogger(name=__name__)
2929

@@ -843,7 +843,7 @@ def match_build_heuristics_on_folder(abspath_of_target: str):
843843
Traverses the files in the target folder. Uses the file list as input to
844844
auto build heuristics, and for each heuristic will yield any of the
845845
build steps that are deemed matching."""
846-
all_files = manager.get_all_files_in_path(abspath_of_target)
846+
all_files = utils.get_all_files_in_path(abspath_of_target)
847847
all_checks = [
848848
AutogenConfScanner(),
849849
PureCFileCompiler(),
@@ -887,7 +887,7 @@ def match_build_heuristics_on_folder(abspath_of_target: str):
887887

888888
def get_all_binary_files_from_folder(path: str) -> Dict[str, List[str]]:
889889
"""Extracts binary artifacts from a list of files, based on file suffix."""
890-
all_files = manager.get_all_files_in_path(path, path)
890+
all_files = utils.get_all_files_in_path(path, path)
891891

892892
executable_files = {'static-libs': [], 'dynamic-libs': [], 'object-files': []}
893893
for fil in all_files:
+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""File utils for target repository"""
15+
16+
import os
17+
from typing import List, Optional
18+
19+
try:
20+
# For execution outside of a docker container
21+
from experimental.build_generator import templates
22+
except (ImportError, SystemError):
23+
# For execution inside of a docker container
24+
import templates
25+
26+
27+
def determine_project_language(path: str) -> str:
  """Guesses whether the project under |path| is mostly C or C++.

  Files ending in `.c` count towards C; files ending in `.cpp` or `.cc` count
  towards C++. C is reported only when at least one C file exists and the C++
  files do not outnumber the C files; in every other case (including a tree
  with no recognised sources) the answer is 'c++'.
  """
  c_count = 0
  cpp_count = 0
  for file_name in get_all_files_in_path(path, path):
    if file_name.endswith('.c'):
      c_count += 1
    elif file_name.endswith(('.cpp', '.cc')):
      cpp_count += 1

  # A tie between the two languages resolves to C, as long as C has any files.
  if c_count > 0 and c_count >= cpp_count:
    return 'c'
  return 'c++'
47+
48+
49+
def get_language_defaults(language: str):
  """Returns the default build settings for |language|.

  The result is a 4-tuple of (compiler variable, compiler flags variable,
  path of the empty fuzzing harness, harness template text). Only 'c' and
  'c++' are known; any other value raises KeyError.
  """
  c_defaults = ('$CC', '$CFLAGS', '/src/empty-fuzzer.c',
                templates.C_BASE_TEMPLATE)
  cpp_defaults = ('$CXX', '$CXXFLAGS', '/src/empty-fuzzer.cpp',
                  templates.CPP_BASE_TEMPLATE)
  defaults_by_language = {'c': c_defaults, 'c++': cpp_defaults}
  return defaults_by_language[language]
56+
57+
58+
def get_all_files_in_path(base_path: str,
                          path_to_subtract: Optional[str] = None) -> List[str]:
  """Gets all files in a tree and returns them as a list of strings.

  Args:
    base_path: Root of the directory tree to walk.
    path_to_subtract: Directory prefix to remove from each returned path.
      Defaults to the current working directory.

  Returns:
    One entry per file found under |base_path|. Paths located inside
    |path_to_subtract| are returned relative to it (no leading '/'); all other
    paths are returned exactly as produced by the walk.
  """
  if path_to_subtract is None:
    path_to_subtract = os.getcwd()

  # Ignore trailing separators so that '/src/foo' and '/src/foo/' behave alike.
  prefix = path_to_subtract.rstrip('/')

  all_files = []
  for root, _, files in os.walk(base_path):
    for fi in files:
      path = os.path.join(root, fi)
      if path.startswith(prefix):
        remainder = path[len(prefix):]
        # Only strip the prefix at a path-component boundary. A plain
        # startswith() would also match sibling directories sharing the same
        # leading characters (e.g. prefix '/src/foo' and '/src/foobar/x.c').
        if not remainder or remainder.startswith('/'):
          path = remainder.lstrip('/')
      all_files.append(path)
  return all_files
+285
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""LLM Build Script Agent"""
15+
16+
import argparse
17+
import os
18+
import re
19+
import subprocess
20+
from typing import Optional
21+
22+
import logger
23+
from agent.base_agent import BaseAgent
24+
from experimental.build_generator import file_utils, templates
25+
from llm_toolkit.models import LLM
26+
from llm_toolkit.prompts import Prompt
27+
from results import BuildResult, Result
28+
from tool.base_tool import BaseTool
29+
from tool.container_tool import ProjectContainerTool
30+
31+
# Upper bound, in characters, for the prompt text; the retry message built
# after a failed build is trimmed so the prompt stays within this limit.
MAX_PROMPT_LENGTH = 20000
32+
33+
34+
class BuildScriptAgent(BaseAgent):
  """Base class for build script agents.

  Runs a chat loop in which the LLM proposes a fuzzing harness and bash build
  commands. The commands are executed through the inspect tool; on success the
  harness and commands are stored in the build result, otherwise the collected
  stderr is sent back to the LLM in a retry prompt.
  """

  def __init__(self,
               trial: int,
               llm: LLM,
               args: argparse.Namespace,
               github_url: str,
               language: str,
               tools: Optional[list[BaseTool]] = None,
               name: str = ''):
    """Stores the target repository details and resets the agent state.

    Args:
      trial: Trial number, forwarded to the base agent.
      llm: LLM used for the chat, forwarded to the base agent.
      args: Parsed command line arguments, forwarded to the base agent.
      github_url: URL of the target repository.
      language: Target language; must be a key known to
        file_utils.get_language_defaults.
      tools: Optional tools, forwarded to the base agent.
      name: Agent name, forwarded to the base agent.
    """
    super().__init__(trial, llm, args, tools, name)
    self.github_url = github_url
    self.language = language
    # Content of discovered build configuration files, keyed by path.
    self.build_files = {}
    # Whether every bash command of the latest LLM response returned 0.
    self.last_status = False
    # Stderr text collected from the latest executed bash commands.
    self.last_result = ''
    self.target_files = {}

    # Get sample fuzzing harness
    _, _, self.harness_path, self.harness_code = (
        file_utils.get_language_defaults(self.language))

  @staticmethod
  def _tag_patterns(tag: str) -> list[str]:
    """Returns the regexes matching |tag| in xml style and code fence style."""
    return [rf'<{tag}>(.*?)</{tag}>', rf'```{tag}(.*?)```']

  def _parse_tag(self, response: str, tag: str) -> str:
    """Parses the tag from LLM response.

    Returns the stripped content of the first match, trying the xml style
    before the code fence style, or '' when |tag| is not present.
    """
    for pattern in self._tag_patterns(tag):
      match = re.search(pattern, response, re.DOTALL)
      if match:
        return match.group(1).strip()

    return ''

  def _parse_tags(self, response: str, tag: str) -> list[str]:
    """Parses the tags from LLM response.

    Returns the stripped content of every match: all xml style matches first,
    followed by all code fence style matches.
    """
    found_matches = []
    for pattern in self._tag_patterns(tag):
      matches = re.findall(pattern, response, re.DOTALL)
      found_matches.extend(content.strip() for content in matches)

    return found_matches

  def _container_handle_bash_commands(self, response: str, tool: BaseTool,
                                      prompt: Prompt) -> Prompt:
    """Handles the command from LLM with container |tool|.

    Updates the fuzzing harness when the response carries one, runs every bash
    command of the response and records the overall status and the collected
    stderr in |self.last_status| and |self.last_result|. |prompt| is returned
    unchanged.
    """
    # Update fuzzing harness
    harness = self._parse_tag(response, 'fuzzer')
    if harness:
      self.harness_code = harness
    if isinstance(tool, ProjectContainerTool):
      tool.write_to_file(self.harness_code, self.harness_path)

    # Try execute the generated build script
    # NOTE(review): a response without any bash command leaves |success| True,
    # so it is later treated as a working build - confirm this is intended.
    stderr_chunks = []
    success = True
    for command in self._parse_tags(response, 'bash'):
      result = tool.execute(command)
      success = success and (result.returncode == 0)
      format_result = self._format_bash_execution_result(result,
                                                         previous_prompt=prompt)
      stderr_chunks.append(self._parse_tag(format_result, 'stderr'))

    self.last_status = success
    self.last_result = ''.join(f'{chunk}\n' for chunk in stderr_chunks)

    return prompt

  def _container_handle_conclusion(self, cur_round: int, response: str,
                                   build_result: BuildResult,
                                   prompt: Prompt) -> Optional[Prompt]:
    """Records the outcome of the latest build attempt in |build_result|.

    Returns:
      None when the latest commands all succeeded (the chat can stop),
      otherwise |prompt| extended with a retry message for the LLM.
    """
    logger.info('----- ROUND %02d Received conclusion -----',
                cur_round,
                trial=build_result.trial)

    # Execution fail
    if not self.last_status:
      retry = templates.LLM_RETRY.replace('{BASH_RESULT}', self.last_result)

      # Keep only the tail of the retry text that still fits in the prompt.
      # The budget must be positive before slicing: retry[-0:] is the whole
      # string and a negative budget would slice from the front instead.
      budget = MAX_PROMPT_LENGTH - len(prompt.gettext())
      if budget > 0:
        prompt.add_problem(retry[-budget:])

      # Store build result
      build_result.compiles = False
      build_result.compile_error = self.last_result

      return prompt

    # Execution success
    build_result.compiles = True
    build_result.fuzz_target_source = self.harness_code
    build_script_source = '\n'.join(self._parse_tags(response, 'bash'))
    if not build_script_source.startswith('#!'):
      build_script_source = templates.EMPTY_OSS_FUZZ_BUILD + build_script_source
    build_result.build_script_source = build_script_source

    return None

  def _container_tool_reaction(self, cur_round: int, response: str,
                               build_result: BuildResult) -> Optional[Prompt]:
    """Validates LLM conclusion or executes its command.

    Returns the prompt for the next round, or None once a build succeeded.
    """
    prompt = self.llm.prompt_type()(None)

    if response:
      prompt = self._container_handle_bash_commands(response, self.inspect_tool,
                                                    prompt)

      # Check result and try building with the new build script
      prompt = self._container_handle_conclusion(cur_round, response,
                                                 build_result, prompt)

      if prompt is None:
        return None

    if not response or not prompt or not prompt.get():
      prompt = self._container_handle_invalid_tool_usage(
          self.inspect_tool, cur_round, response, prompt)

    return prompt

  def execute(self, result_history: list[Result]) -> BuildResult:
    """Executes the agent based on previous result.

    Chats with the LLM until a round yields no follow-up prompt or the round
    limit is reached, then returns the build result collected so far.
    """
    last_result = result_history[-1]
    logger.info('Executing %s', self.name, trial=last_result.trial)
    benchmark = last_result.benchmark
    self.inspect_tool = ProjectContainerTool(benchmark, name='inspect')
    # Everything after the tool is created runs under try/finally so that the
    # tool is terminated even when compilation or prompt creation fails.
    try:
      self.inspect_tool.compile(extra_commands=' && rm -rf /out/* > /dev/null')
      cur_round = 1
      build_result = BuildResult(benchmark=benchmark,
                                 trial=last_result.trial,
                                 work_dirs=last_result.work_dirs,
                                 author=self,
                                 chat_history={self.name: ''})

      prompt = self._initial_prompt(result_history)
      client = self.llm.get_chat_client(model=self.llm.get_model())
      while prompt and cur_round < self.max_round:
        response = self.chat_llm(cur_round,
                                 client=client,
                                 prompt=prompt,
                                 trial=last_result.trial)
        prompt = self._container_tool_reaction(cur_round, response,
                                               build_result)
        cur_round += 1
    finally:
      logger.info('Stopping and removing the inspect container %s',
                  self.inspect_tool.container_id,
                  trial=last_result.trial)
      self.inspect_tool.terminate()

    return build_result
192+
193+
194+
class BuildSystemBuildScriptAgent(BuildScriptAgent):
  """Generate a working Dockerfile and build script from scratch
  with build system.

  The agent clones the target repository, collects the content of well-known
  build configuration files and hands them to the LLM in the initial prompt.
  """

  def __init__(self,
               trial: int,
               llm: LLM,
               args: argparse.Namespace,
               github_url: str,
               language: str,
               tools: Optional[list[BaseTool]] = None,
               name: str = ''):
    """Initialises the agent and the build file names it searches for."""
    super().__init__(trial, llm, args, github_url, language, tools, name)
    # Build configuration file name -> paths of the matching files found.
    self.target_files = {
        'Makefile': [],
        'configure.ac': [],
        'Makefile.am': [],
        'autogen.sh': [],
        'bootstrap.sh': [],
        'CMakeLists.txt': [],
        'Config.in': [],
    }

  def _project_repo_dir(self) -> str:
    """Returns the last path component of the repository URL.

    A trailing '/' is ignored so that the name is never empty.
    """
    return self.github_url.rstrip('/').split('/')[-1]

  def _discover_build_configurations(self) -> bool:
    """Helper to discover the build configurations of a repository.

    Clones the repository into the work directory unless it is already there,
    then reads every known build configuration file into |self.build_files|,
    keyed by its path relative to the clone (with a leading separator).

    Returns:
      True when at least one build configuration file was found.

    Raises:
      subprocess.CalledProcessError: If cloning the repository fails.
    """
    # Clone target repository
    target_path = os.path.join(self.args.work_dirs, self._project_repo_dir())
    if not os.path.isdir(target_path):
      # Pass an argument list instead of a shell string so that special
      # characters in the URL or path are never interpreted by a shell.
      subprocess.check_call([
          'git', 'clone', '--recurse-submodules', self.github_url, target_path
      ])

    # Locate common build configuration files
    for root_dir, _, files in os.walk(target_path):
      for file in files:
        if file in self.target_files:
          full_path = os.path.join(root_dir, file)
          self.target_files[file].append(full_path)

    # Extract content of build files
    for files in self.target_files.values():
      for file in files:
        # Build files of arbitrary repositories are not guaranteed to be valid
        # UTF-8; replace undecodable bytes rather than aborting the discovery.
        with open(file, 'r', encoding='utf-8', errors='replace') as f:
          # Every path starts with |target_path|; cut off only that prefix
          # instead of replacing every occurrence of it inside the path.
          self.build_files[file[len(target_path):]] = f.read()

    return len(self.build_files) > 0

  def _initial_prompt(self, results: list[Result]) -> Prompt:  # pylint: disable=unused-argument
    """Constructs initial prompt of the agent.

    The prompt holds the priming text and a problem statement built from the
    discovered build files, the template Dockerfile and the sample harness.
    """
    prompt = self.llm.prompt_type()(None)

    # Extract build configuration files content
    build_files_str = [
        templates.LLM_BUILD_FILE_TEMPLATE.replace('{PATH}', file).replace(
            '{CONTENT}', content) for file, content in self.build_files.items()
    ]

    # Extract template Dockerfile content
    dockerfile_str = templates.CLEAN_OSS_FUZZ_DOCKER
    dockerfile_str = dockerfile_str.replace('{additional_packages}', '')
    dockerfile_str = dockerfile_str.replace('{repo_url}', self.github_url)
    dockerfile_str = dockerfile_str.replace('{project_repo_dir}',
                                            self._project_repo_dir())

    # Prepare prompt problem string
    problem = templates.LLM_PROBLEM.replace('{BUILD_FILES}',
                                            '\n'.join(build_files_str))
    problem = problem.replace('{DOCKERFILE}', dockerfile_str)
    problem = problem.replace('{FUZZER}', self.harness_code)
    problem = problem.replace('{FUZZING_FILE}',
                              self.harness_path.split('/')[-1])

    prompt.add_priming(templates.LLM_PRIMING)
    prompt.add_problem(problem)

    return prompt

  def execute(self, result_history: list[Result]) -> BuildResult:
    """Executes the agent based on previous result.

    Returns an empty build result without contacting the LLM when the
    repository contains none of the known build configuration files.
    """
    if not self._discover_build_configurations():
      # The format string needs a placeholder for the agent name; passing an
      # argument without one makes the log record fail to format.
      logger.info('%s: No known build configuration.',
                  self.name,
                  trial=result_history[-1].trial)
      return BuildResult(benchmark=result_history[-1].benchmark,
                         trial=result_history[-1].trial,
                         work_dirs=result_history[-1].work_dirs,
                         author=self,
                         chat_history={self.name: ''})

    return super().execute(result_history)

0 commit comments

Comments
 (0)