Skip to content

Commit d16df19

Browse files
authored
Merge pull request #109 from evan-cao-wb/add-PdfToTextConverter
MR for PdfToTextConverter
2 parents 7e95be0 + 780bb4e commit d16df19

File tree

14 files changed

+382
-40
lines changed

14 files changed

+382
-40
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
5+
namespace BotSharp.Abstraction.Knowledges
6+
{
7+
/// <summary>
/// Abstraction for a converter that turns an image into text.
/// </summary>
public interface IPaddleOcrConverter
8+
{
9+
// void LoadModel();
10+
/// <summary>
/// Asynchronously converts the image identified by <paramref name="loadPath"/> into text.
/// </summary>
/// <param name="loadPath">
/// Location of the image to convert. NOTE(review): presumably a local file path — confirm against the implementation.
/// </param>
/// <returns>A task whose result is the text produced for the image.</returns>
Task<string> ConvertImageToText(string loadPath);
11+
}
12+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
using Microsoft.AspNetCore.Http;
5+
6+
namespace BotSharp.Abstraction.Knowledges
7+
{
8+
/// <summary>
/// Abstraction for a converter that turns an uploaded PDF file into text.
/// </summary>
public interface IPdf2TextConverter
9+
{
10+
/// <summary>
/// Asynchronously converts the uploaded PDF <paramref name="formFile"/> into text.
/// </summary>
/// <param name="formFile">The uploaded file to convert.</param>
/// <param name="startPageNum">
/// Optional start page number. NOTE(review): exact semantics (inclusive bound, 1-based numbering,
/// meaning of <c>null</c>) are defined by each implementation — confirm there.
/// </param>
/// <param name="endPageNum">
/// Optional end page number. NOTE(review): exact semantics are defined by each implementation — confirm there.
/// </param>
/// <returns>A task whose result is the text produced for the document.</returns>
Task<string> ConvertPdfToText(IFormFile formFile, int? startPageNum, int? endPageNum);
11+
}
12+
}

src/Infrastructure/BotSharp.Core/BotSharp.Core.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
<PackageReference Include="EntityFrameworkCore.BootKit" Version="6.2.1" />
7777
<PackageReference Include="Fluid.Core" Version="2.4.0" />
7878
<PackageReference Include="TensorFlow.Keras" Version="0.11.2" />
79+
<PackageReference Include="PdfPig" Version="0.1.9-alpha-20230806-4a480" />
7980
</ItemGroup>
8081

8182
<ItemGroup>

src/Infrastructure/BotSharp.Core/BotSharpServiceCollectionExtensions.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using BotSharp.Core.Functions;
33
using BotSharp.Core.Hooks;
44
using BotSharp.Core.Templating;
5+
using BotSharp.Core.Plugins.Knowledges.Services;
56
using Microsoft.AspNetCore.Builder;
67
using Microsoft.Extensions.Configuration;
78

@@ -95,5 +96,7 @@ public static void RegisterPlugins(IServiceCollection services, IConfiguration c
9596
loader.Load();
9697

9798
services.AddSingleton(loader);
99+
100+
services.AddSingleton<IPdf2TextConverter, PigPdf2TextConverter>();
98101
}
99102
}

src/Infrastructure/BotSharp.Core/Plugins/Knowledges/KnowledgeBaseSettings.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ public class KnowledgeBaseSettings
55
public string VectorDb { get; set; }
66
public string TextEmbedding { get; set; }
77
public string TextCompletion { get; set; }
8+
public string Pdf2TextConverter { get; set; }
89
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Text;
5+
using Microsoft.AspNetCore.Http;
6+
using UglyToad.PdfPig;
7+
using UglyToad.PdfPig.Content;
8+
9+
namespace BotSharp.Core.Plugins.Knowledges.Services;
10+
11+
/// <summary>
/// <see cref="IPdf2TextConverter"/> implementation that extracts the text of an uploaded PDF with PdfPig.
/// </summary>
public class PigPdf2TextConverter : IPdf2TextConverter
{
    /// <summary>
    /// Extracts the text of the pages of <paramref name="formFile"/> that fall inside the requested page range.
    /// </summary>
    /// <param name="formFile">The uploaded PDF file.</param>
    /// <param name="startPageNum">First page number to include; <c>null</c> means no lower bound.</param>
    /// <param name="endPageNum">Last page number to include; <c>null</c> means no upper bound.</param>
    /// <returns>
    /// The concatenated text of the selected pages, or an empty string when the upload has no content.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="formFile"/> is <c>null</c>.</exception>
    public async Task<string> ConvertPdfToText(IFormFile formFile, int? startPageNum, int? endPageNum)
    {
        return await OpenPdfDocumentAsync(formFile, startPageNum, endPageNum);
    }

    /// <summary>
    /// Copies the upload to a temporary file, opens it with PdfPig and concatenates the text of every
    /// page whose number lies within [<paramref name="startPageNum"/>, <paramref name="endPageNum"/>].
    /// </summary>
    private async Task<string> OpenPdfDocumentAsync(IFormFile formFile, int? startPageNum, int? endPageNum)
    {
        if (formFile is null)
        {
            throw new ArgumentNullException(nameof(formFile));
        }

        if (formFile.Length <= 0)
        {
            return string.Empty;
        }

        // GetTempFileName creates a real file on disk, so it must be removed once we are done;
        // otherwise every upload leaks one temp file.
        var filePath = Path.GetTempFileName();
        try
        {
            using (var stream = System.IO.File.Create(filePath))
            {
                await formFile.CopyToAsync(stream);
            }

            // StringBuilder avoids re-copying the accumulated text for every page.
            var content = new StringBuilder();

            // PdfDocument holds the file open; dispose it before the temp file is deleted.
            using (var document = PdfDocument.Open(filePath))
            {
                foreach (Page page in document.GetPages())
                {
                    if (startPageNum.HasValue && page.Number < startPageNum.Value)
                    {
                        continue;
                    }

                    if (endPageNum.HasValue && page.Number > endPageNum.Value)
                    {
                        continue;
                    }

                    content.Append(page.Text);
                }
            }

            return content.ToString();
        }
        finally
        {
            System.IO.File.Delete(filePath);
        }
    }
}

src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<Project Sdk="Microsoft.NET.Sdk">
1+
<Project Sdk="Microsoft.NET.Sdk">
22

33
<PropertyGroup>
44
<TargetFramework>net6.0</TargetFramework>
@@ -9,11 +9,11 @@
99

1010
<ItemGroup>
1111
<PackageReference Include="Microsoft.AspNetCore.Mvc.Core" Version="2.2.5" />
12-
<PackageReference Include="PdfPig" Version="0.1.8" />
1312
</ItemGroup>
1413

1514
<ItemGroup>
1615
<ProjectReference Include="..\BotSharp.Abstraction\BotSharp.Abstraction.csproj" />
16+
<ProjectReference Include="..\BotSharp.Core\BotSharp.Core.csproj" />
1717
</ItemGroup>
1818

1919
</Project>

src/Infrastructure/BotSharp.OpenAPI/Controllers/KnowledgeController.cs

Lines changed: 11 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
using Microsoft.AspNetCore.Http;
44
using UglyToad.PdfPig.Content;
55
using UglyToad.PdfPig;
6+
using BotSharp.Core.Plugins.Knowledges;
7+
68

79
namespace BotSharp.OpenAPI.Controllers;
810

@@ -11,11 +13,13 @@ namespace BotSharp.OpenAPI.Controllers;
1113
public class KnowledgeController : ControllerBase, IApiAdapter
1214
{
1315
private readonly IKnowledgeService _knowledgeService;
14-
public KnowledgeController(IKnowledgeService knowledgeService)
16+
private readonly IServiceProvider _services;
17+
18+
public KnowledgeController(IKnowledgeService knowledgeService, IServiceProvider services)
1519
{
1620
_knowledgeService = knowledgeService;
21+
_services = services;
1722
}
18-
1923
[HttpGet("/knowledge/{agentId}")]
2024
public async Task<List<RetrievedResult>> RetrieveKnowledge([FromRoute] string agentId, [FromQuery(Name = "q")] string question)
2125
{
@@ -27,44 +31,22 @@ public async Task<List<RetrievedResult>> RetrieveKnowledge([FromRoute] string ag
2731
}
2832

2933
[HttpPost("/knowledge/{agentId}")]
30-
public async Task<IActionResult> FeedKnowledge([FromRoute] string agentId, List<IFormFile> files, [FromQuery] int? startPageNum, [FromQuery] int? endPageNum)
34+
public async Task<IActionResult> FeedKnowledge([FromRoute] string agentId, List<IFormFile> files, [FromQuery] int? startPageNum, [FromQuery] int? endPageNum, [FromQuery] bool? paddleModel)
3135
{
36+
var setttings = _services.GetRequiredService<KnowledgeBaseSettings>();
37+
var textConverter = _services.GetServices<IPdf2TextConverter>().First(x => x.GetType().FullName.EndsWith(setttings.Pdf2TextConverter));
3238
long size = files.Sum(f => f.Length);
3339

3440
foreach (var formFile in files)
3541
{
36-
if (formFile.Length <= 0)
37-
{
38-
continue;
39-
}
40-
41-
var filePath = Path.GetTempFileName();
42-
43-
using (var stream = System.IO.File.Create(filePath))
44-
{
45-
await formFile.CopyToAsync(stream);
46-
}
47-
48-
var document = PdfDocument.Open(filePath);
4942
var content = "";
50-
foreach (Page page in document.GetPages())
51-
{
52-
if (startPageNum.HasValue && page.Number < startPageNum.Value)
53-
{
54-
continue;
55-
}
56-
57-
if (endPageNum.HasValue && page.Number > endPageNum.Value)
58-
{
59-
continue;
60-
}
6143

62-
content += page.Text;
63-
}
44+
content = await textConverter.ConvertPdfToText(formFile, startPageNum, endPageNum);
6445

6546
// Process uploaded files
6647
// Don't rely on or trust the FileName property without validation.
6748

49+
// Add FeedWithMetaData
6850
await _knowledgeService.Feed(new KnowledgeFeedModel
6951
{
7052
AgentId = agentId,

src/Plugins/BotSharp.Plugin.PaddleSharp/BotSharp.Plugin.PaddleSharp.csproj

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,15 @@
88
</PropertyGroup>
99

1010
<ItemGroup>
11+
<PackageReference Include="Docnet.Core" Version="2.5.0-alpha.1" />
12+
<PackageReference Include="Magick.NET-Q16-AnyCPU" Version="13.2.0" />
13+
<PackageReference Include="Magick.NET.Core" Version="13.2.0" />
1114
<PackageReference Include="OpenCvSharp4.runtime.win" Version="4.7.0.20230115" />
1215
<PackageReference Include="Sdcb.PaddleInference" Version="2.4.1.3" />
16+
<PackageReference Include="Sdcb.PaddleInference.runtime.win64.mkl" Version="2.5.1" />
1317
<PackageReference Include="Sdcb.PaddleOCR" Version="2.6.0.5" />
1418
<PackageReference Include="Sdcb.PaddleOCR.Models.LocalV3" Version="2.6.0.5" />
19+
<PackageReference Include="System.Drawing.Common" Version="8.0.0-preview.7.23375.5" />
1520
</ItemGroup>
1621

1722
<ItemGroup>

src/Plugins/BotSharp.Plugin.PaddleSharp/PaddleSharpPlugin.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
using BotSharp.Abstraction.Knowledges;
12
using BotSharp.Abstraction.Plugins;
3+
using BotSharp.Plugin.PaddleSharp.Providers;
4+
using BotSharp.Plugin.PaddleSharp.Settings;
25
using Microsoft.Extensions.Configuration;
36
using Microsoft.Extensions.DependencyInjection;
47
using System;
@@ -9,6 +12,9 @@ public class PaddleSharpPlugin : IBotSharpPlugin
912
{
1013
/// <summary>
/// Registers the PaddleSharp plugin services in the dependency-injection container.
/// </summary>
/// <param name="services">Service collection that receives the registrations.</param>
/// <param name="config">Configuration from which the "PaddleSharp" section is bound.</param>
public void RegisterDI(IServiceCollection services, IConfiguration config)
1114
{
12-
15+
// Bind the "PaddleSharp" configuration section onto a fresh settings object.
var settings = new PaddleSharpSettings();
16+
config.Bind("PaddleSharp", settings);
17+
// Expose that same bound settings instance as a singleton.
services.AddSingleton(x => settings);
18+
// Register this plugin's Pdf2TextConverter as an IPdf2TextConverter singleton.
services.AddSingleton<IPdf2TextConverter, Pdf2TextConverter>();
1319
}
1420
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Text;
5+
using Sdcb.PaddleOCR;
6+
using Sdcb.PaddleOCR.Models;
7+
using Sdcb.PaddleInference;
8+
using Sdcb.PaddleOCR.Models.LocalV3;
9+
using OpenCvSharp;
10+
using System.Threading.Tasks;
11+
using BotSharp.Abstraction.Knowledges;
12+
using BotSharp.Plugin.PaddleSharp.Settings;
13+
14+
namespace BotSharp.Plugin.PaddleSharp.Providers;
15+
16+
public class PaddleOcrConverter : IPaddleOcrConverter
17+
{
18+
private FullOcrModel _paddleFullOcrmodel;
19+
private QueuedPaddleOcrAll _allModel;
20+
private readonly PaddleSharpSettings _paddleSharpSettings;
21+
22+
public PaddleOcrConverter(FullOcrModel paddleFullOcrmodel, QueuedPaddleOcrAll allModel, PaddleSharpSettings paddleSharpSettings)
23+
{
24+
_paddleFullOcrmodel = paddleFullOcrmodel;
25+
_allModel = allModel;
26+
_paddleSharpSettings = paddleSharpSettings;
27+
}
28+
29+
private void LoadModel()
30+
{
31+
_allModel = new(() => new PaddleOcrAll(_paddleFullOcrmodel, _paddleSharpSettings.device)
32+
{
33+
AllowRotateDetection = _paddleSharpSettings.allowRotateDetection,
34+
Enable180Classification = _paddleSharpSettings.enable180Classification,
35+
}, consumerCount: _paddleSharpSettings.consumerCount, boundedCapacity: _paddleSharpSettings.boundedCapacity);
36+
}
37+
38+
private void DisposeModel()
39+
{
40+
_allModel.Dispose();
41+
}
42+
43+
public async Task<string> ConvertImageToText(string loadPath)
44+
{
45+
_allModel = new(() => new PaddleOcrAll(_paddleFullOcrmodel, _paddleSharpSettings.device)
46+
{
47+
AllowRotateDetection = _paddleSharpSettings.allowRotateDetection,
48+
Enable180Classification = _paddleSharpSettings.enable180Classification,
49+
}, consumerCount: _paddleSharpSettings.consumerCount, boundedCapacity: _paddleSharpSettings.boundedCapacity);
50+
51+
var contents = "";
52+
using (Mat src = Cv2.ImRead(loadPath))
53+
{
54+
PaddleOcrResult result = await _allModel.Run(src);
55+
56+
foreach (PaddleOcrResultRegion region in result.Regions)
57+
{
58+
if (region.Score > _paddleSharpSettings.acceptScore)
59+
{
60+
contents += region.Text + " ";
61+
}
62+
}
63+
}
64+
65+
_allModel.Dispose();
66+
return contents;
67+
}
68+
}
69+
*/

0 commit comments

Comments
 (0)