Skip to content

chore(profiling): add internal/cmpcompress tool #3518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ require (
github.com/go-viper/mapstructure/v2 v2.2.1
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db
github.com/google/uuid v1.6.0
github.com/klauspost/compress v1.18.0
github.com/pierrec/lz4/v4 v4.1.22
github.com/puzpuzpuz/xsync/v3 v3.5.1
github.com/richardartoul/molecule v1.0.1-0.20240531184615-7ca0df43c0b3
github.com/spaolacci/murmur3 v1.1.0
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
Expand All @@ -110,6 +112,8 @@ github.com/outcaste-io/ristretto v0.2.3 h1:AK4zt/fJ76kjlYObOeNwh4T3asEuaCmp26pOv
github.com/outcaste-io/ristretto v0.2.3/go.mod h1:W8HywhmtlopSB1jeMg3JtdIhf+DYkLAr0VN/s4+MHac=
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c h1:dAMKvw0MlJT1GshSTtih8C2gDs04w8dReiOGXrGLNoY=
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU=
github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
Expand Down
28 changes: 28 additions & 0 deletions internal/cmpcompress/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# cmpcompress

## How it works

For each file in the given zip files, cmpcompress will:

1. Unpack the file.
2. Compress the file using different compression algorithms.
3. Compare the size of the compressed files.
4. Print the results as CSV.

## Example

```
# Compare different compression algorithms for each file in the given zip files.
cmpcompress file1.zip file2.zip ...

# Output:
src,file,algorithm,compression_ratio,speed_mb_per_sec,utility
my_archive.zip,file1.txt,gzip-1,5.23,150,784.50
my_archive.zip,file1.txt,gzip-6,8.11,120,973.20
my_archive.zip,file1.txt,gzip-9,9.50,80,760.00
my_archive.zip,file1.txt,kgzip-1,5.30,160,848.00
my_archive.zip,file1.txt,zstd-1,6.05,250,1512.50
my_archive.zip,file1.txt,zstd-2,9.80,200,1960.00
other_data.zip,image.png,gzip-6,1.10,50,55.00
other_data.zip,image.png,zstd-1,1.50,90,135.00
```
325 changes: 325 additions & 0 deletions internal/cmpcompress/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,325 @@
// WARNING: This entire file was generated by Large Language Models (LLMs).

package main

import (
"archive/zip"
"bytes"
"compress/gzip"
"fmt"
"io"
"log"
"math"
"os"
"path/filepath"
"sort"
"time"

kgzip "github.com/klauspost/compress/gzip"
"github.com/klauspost/compress/zstd"
lz4 "github.com/pierrec/lz4/v4"
)

// Compression levels to benchmark for each algorithm. Map keys are the
// actual level values passed to the encoder; map values are the numeric
// labels used in the CSV "algorithm" column (e.g. "gzip-6").
var gzipLevels = map[int]int{
	gzip.BestSpeed:          1,
	gzip.DefaultCompression: 6,
	gzip.BestCompression:    9,
}

// kgzipLevels mirrors gzipLevels for the klauspost/compress gzip implementation.
var kgzipLevels = map[int]int{
	kgzip.BestSpeed:          1,
	kgzip.DefaultCompression: 6,
	kgzip.BestCompression:    9,
}

// zstd exposes named speed presets rather than raw numeric levels; label
// them 1-4 from fastest to best compression for the CSV output.
var zstdLevels = map[zstd.EncoderLevel]int{
	zstd.SpeedFastest:           1,
	zstd.SpeedDefault:           2,
	zstd.SpeedBetterCompression: 3,
	zstd.SpeedBestCompression:   4,
}

// LZ4 levels: the default fast mode (labeled 0) plus selected
// high-compression levels (labeled 1, 4, 9).
var lz4Levels = map[lz4.CompressionLevel]int{
	lz4.Fast:   0,
	lz4.Level1: 1,
	lz4.Level4: 4,
	lz4.Level9: 9,
}

const numRuns = 10 // Number of times to run each compression (median time / mean size are reported)

// main validates the command line, prints the CSV header, and benchmarks
// every file inside each zip archive given as an argument.
func main() {
	archives := os.Args[1:]
	if len(archives) == 0 {
		log.Fatalf("Usage: %s <zipfile1> [zipfile2]...", os.Args[0])
	}

	// CSV header for the per-file, per-algorithm rows that follow.
	fmt.Println("src,file,algorithm,compression_ratio,speed_mb_per_sec,utility")

	for _, path := range archives {
		processZipFile(path)
	}
}

// processZipFile opens the archive at zipPath and runs every configured
// compression algorithm against each regular file it contains. Failures on
// individual entries are logged and skipped so one bad entry does not abort
// the whole archive.
func processZipFile(zipPath string) {
	archive, err := zip.OpenReader(zipPath)
	if err != nil {
		log.Printf("Error opening zip file %s: %v", zipPath, err)
		return
	}
	defer archive.Close()

	for _, entry := range archive.File {
		if entry.FileInfo().IsDir() {
			continue // directories carry no payload to compress
		}

		raw, err := readZipEntry(entry, zipPath)
		if err != nil {
			continue // readZipEntry already logged the cause
		}

		size := float64(len(raw))
		if size == 0 {
			// Skip empty files: the compression ratio would divide by zero.
			continue
		}

		runCompressionTest(zipPath, entry.Name, raw, size, "gzip", gzipLevels)
		runCompressionTest(zipPath, entry.Name, raw, size, "kgzip", kgzipLevels)
		runCompressionTest(zipPath, entry.Name, raw, size, "zstd", zstdLevels)
		runCompressionTest(zipPath, entry.Name, raw, size, "lz4", lz4Levels)
	}
}

// readZipEntry unpacks a single archive member into memory, logging any
// open or read failure before returning it.
func readZipEntry(entry *zip.File, zipPath string) ([]byte, error) {
	rc, err := entry.Open()
	if err != nil {
		log.Printf("Error opening file %s in zip %s: %v", entry.Name, zipPath, err)
		return nil, err
	}
	defer rc.Close()

	data, err := io.ReadAll(rc)
	if err != nil {
		log.Printf("Error reading file %s in zip %s: %v", entry.Name, zipPath, err)
		return nil, err
	}
	return data, nil
}

// runCompressionTest performs compression runs for a specific algorithm and levels.
func runCompressionTest(zipPath, fileName string, originalData []byte, originalSize float64, algorithmName string, levels interface{}) {
processLevel := func(level interface{}, levelName int) {
var durations []time.Duration
var compressedSizes []float64
for i := 0; i < numRuns; i++ {
compSize, duration := compressData(originalData, algorithmName, level)
if compSize > 0 { // Only record successful runs
durations = append(durations, duration)
compressedSizes = append(compressedSizes, compSize)
}
}

if len(compressedSizes) == 0 {
log.Printf("%s compression failed for all %d runs for file %s, level %d",
algorithmName, numRuns, fileName, levelName)
return // Skip if all runs failed
}

medianDuration := calculateMedianDuration(durations)
avgCompressedSize := 0.0
for _, s := range compressedSizes {
avgCompressedSize += s
}
avgCompressedSize /= float64(len(compressedSizes))
combinedAlgo := fmt.Sprintf("%s-%d", algorithmName, levelName)
printResult(zipPath, fileName, combinedAlgo, originalSize, avgCompressedSize, medianDuration)
}

switch lvls := levels.(type) {
case map[int]int:
for level, name := range lvls {
processLevel(level, name)
}
case map[zstd.EncoderLevel]int:
for level, name := range lvls {
processLevel(level, name)
}
case map[lz4.CompressionLevel]int:
for level, name := range lvls {
processLevel(level, name)
}
default:
log.Printf("Unsupported levels type for algorithm %s: %T", algorithmName, levels)
}
}

// compressData runs a single compression of data with the requested
// algorithm/level pair and returns the compressed size in bytes together
// with the elapsed wall-clock time. A returned size of 0 signals a failed
// run (the cause is logged). Writer construction happens inside the timed
// region on purpose: encoder setup cost is part of what is being compared.
func compressData(data []byte, algorithm string, level interface{}) (float64, time.Duration) {
	var out bytes.Buffer
	begin := time.Now()

	switch algorithm {
	case "gzip":
		lvl, ok := level.(int)
		if !ok {
			log.Printf("Invalid level type for gzip: %T", level)
			return 0, 0
		}
		zw, err := gzip.NewWriterLevel(&out, lvl)
		if err != nil {
			log.Printf("Error creating gzip writer (level %d): %v", lvl, err)
			return 0, 0
		}
		defer zw.Close() // safety net for the early-error returns below
		if _, err = zw.Write(data); err != nil {
			log.Printf("Error compressing with gzip (level %d): %v", lvl, err)
			return 0, 0
		}
		// Close before measuring: it flushes the trailing gzip footer.
		if err = zw.Close(); err != nil {
			log.Printf("Error closing gzip writer (level %d): %v", lvl, err)
			return 0, 0
		}
	case "kgzip":
		lvl, ok := level.(int)
		if !ok {
			log.Printf("Invalid level type for kgzip: %T", level)
			return 0, 0
		}
		kw, err := kgzip.NewWriterLevel(&out, lvl)
		if err != nil {
			log.Printf("Error creating kgzip writer (level %d): %v", lvl, err)
			return 0, 0
		}
		defer kw.Close() // safety net for the early-error returns below
		if _, err = kw.Write(data); err != nil {
			log.Printf("Error compressing with kgzip (level %d): %v", lvl, err)
			return 0, 0
		}
		if err = kw.Close(); err != nil {
			log.Printf("Error closing kgzip writer (level %d): %v", lvl, err)
			return 0, 0
		}
	case "zstd":
		lvl, ok := level.(zstd.EncoderLevel)
		if !ok {
			log.Printf("Invalid level type for zstd: %T", level)
			return 0, 0
		}
		zsw, err := zstd.NewWriter(&out, zstd.WithEncoderLevel(lvl))
		if err != nil {
			log.Printf("Error creating zstd writer (level %s): %v", lvl.String(), err)
			return 0, 0
		}
		// Closing matters even on error paths: the zstd encoder owns
		// background goroutines that Close releases.
		defer zsw.Close()
		if _, err = zsw.Write(data); err != nil {
			log.Printf("Error compressing with zstd (level %s): %v", lvl.String(), err)
			return 0, 0
		}
		if err = zsw.Close(); err != nil {
			log.Printf("Error closing zstd writer (level %s): %v", lvl.String(), err)
			return 0, 0
		}
	case "lz4":
		lvl, ok := level.(lz4.CompressionLevel)
		if !ok {
			log.Printf("Invalid level type for lz4: %T", level)
			return 0, 0
		}
		lw := lz4.NewWriter(&out)
		// lz4 configures its level via a functional option after construction.
		if err := lw.Apply(lz4.CompressionLevelOption(lvl)); err != nil {
			log.Printf("Error applying lz4 compression level option (level %s): %v", lvl.String(), err)
			return 0, 0
		}
		defer lw.Close() // safety net for the early-error returns below
		if _, err := lw.Write(data); err != nil {
			log.Printf("Error compressing with lz4 (level %s): %v", lvl.String(), err)
			return 0, 0
		}
		if err := lw.Close(); err != nil {
			log.Printf("Error closing lz4 writer (level %s): %v", lvl.String(), err)
			return 0, 0
		}
	default:
		log.Printf("Unsupported algorithm: %s", algorithm)
		return 0, 0
	}

	return float64(out.Len()), time.Since(begin)
}

// calculateMedianDuration sorts the durations and returns the median.
func calculateMedianDuration(durations []time.Duration) time.Duration {
n := len(durations)
if n == 0 {
return 0
}
// Sort the durations
sort.Slice(durations, func(i, j int) bool {
return durations[i] < durations[j]
})
// Calculate median
if n%2 == 1 {
// Odd number of elements
return durations[n/2]
}
// Even number of elements
mid1 := durations[n/2-1]
mid2 := durations[n/2]
return (mid1 + mid2) / 2
}

// printResult emits one CSV row: source archive, file name, combined
// algorithm label, compression ratio, throughput in MB/s, and a "utility"
// score (ratio * speed). Undefined metrics are printed as NaN; a
// zero-duration run reports infinite speed and NaN utility.
func printResult(src, file, combinedAlgorithm string, originalSize, compressedSize float64, duration time.Duration) {
	// Only the basename of the in-archive path is reported.
	name := filepath.Base(file)

	// With a zero size on either side, every metric is undefined.
	if originalSize == 0 || compressedSize == 0 {
		fmt.Printf("%s,%s,%s,NaN,NaN,NaN\n", filepath.Base(src), name, combinedAlgorithm)
		return
	}

	ratio := originalSize / compressedSize

	// Defaults cover the degenerate zero/negative-duration case.
	speed := math.Inf(1)
	utility := math.NaN()
	if secs := duration.Seconds(); secs > 0 {
		speed = (originalSize / (1024 * 1024)) / secs
		// Only compute utility when both factors are finite numbers.
		if !math.IsNaN(ratio) && !math.IsInf(ratio, 0) && !math.IsNaN(speed) && !math.IsInf(speed, 0) {
			utility = ratio * speed
		}
	}

	var utilityStr string
	switch {
	case math.IsNaN(utility):
		utilityStr = "NaN"
	case math.IsInf(utility, 1):
		utilityStr = "+Inf"
	case math.IsInf(utility, -1):
		utilityStr = "-Inf"
	default:
		utilityStr = fmt.Sprintf("%.2f", utility)
	}

	fmt.Printf("%s,%s,%s,%.2f,%.0f,%s\n",
		filepath.Base(src),
		name,
		combinedAlgorithm,
		ratio,
		speed,
		utilityStr)
}
Loading