summaryrefslogtreecommitdiff
path: root/Services/OcrService.cs
diff options
context:
space:
mode:
author Jake Mannens <jake@asger.xyz> 2023-09-01 13:03:57 +1000
committer Jake Mannens <jake@asger.xyz> 2025-08-20 00:48:44 +1000
commit b286a0b0f1fcdb511d2dbb8886039cfb0182c89b (patch)
tree 83d8d8ec7a575c36f22bfab01fa2708881c3f086 /Services/OcrService.cs
parent 5b93706343484914370a87fddea29874c8156321 (diff)
Merged OCR functionality
Diffstat (limited to 'Services/OcrService.cs')
-rw-r--r-- Services/OcrService.cs 117
1 files changed, 117 insertions, 0 deletions
diff --git a/Services/OcrService.cs b/Services/OcrService.cs
new file mode 100644
index 0000000..2f65e43
--- /dev/null
+++ b/Services/OcrService.cs
@@ -0,0 +1,117 @@
+using HyperBooru.Util;
+using Microsoft.EntityFrameworkCore;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+using System.Text.RegularExpressions;
+using Tesseract;
+
+namespace HyperBooru.Services;
+
+/// <summary>
+/// Hosted background service that periodically runs Tesseract OCR over every
+/// image media item that has no OCR data yet and stores the recognised text.
+/// </summary>
+public class OcrService : IHostedService {
+    // Time between two OCR passes, and the delay before the first pass after startup.
+    private static readonly TimeSpan ProcessInterval = TimeSpan.FromMinutes(30);
+    private static readonly TimeSpan StartupDelay = TimeSpan.FromSeconds(30);
+
+    // Matches every run of characters other than lowercase ASCII letters and digits;
+    // used to collapse such runs into a single space when building searchable text.
+    private static readonly Regex SpaceRegex = new(@"[^0-9a-z]+", RegexOptions.Compiled);
+
+    // The pass currently running (or the last one that ran) and its cancellation source.
+    private Task? task;
+    private CancellationTokenSource cts = new();
+
+    private readonly Timer timer;
+
+    private readonly IServiceScopeFactory scopeFactory;
+    private readonly ILogger<OcrService> logger;
+    private readonly IDbContextFactory<HBContext> dbFactory;
+
+    /// <summary>
+    /// Creates the service and its (initially unscheduled) timer.
+    /// </summary>
+    /// <param name="scopeFactory">Used to create a DI scope for each OCR pass.</param>
+    /// <param name="logger">Logger for service lifecycle and OCR progress.</param>
+    /// <param name="dbFactory">Factory for short-lived database contexts.</param>
+    public OcrService(
+        IServiceScopeFactory scopeFactory,
+        ILogger<OcrService> logger,
+        IDbContextFactory<HBContext> dbFactory) {
+
+        this.scopeFactory = scopeFactory;
+        this.logger = logger;
+        this.dbFactory = dbFactory;
+
+        // No due time is given here, so the timer stays idle until StartAsync schedules it.
+        timer = new((object? state) => {
+            // Skip this tick if the previous pass is still running.
+            if(task is not null && !task.IsCompleted) {
+                return;
+            }
+            cts = new();
+            task = ProcessAllAsync(cts.Token);
+        });
+    }
+
+    /// <summary>
+    /// Schedules the first OCR pass after <see cref="StartupDelay"/> and subsequent
+    /// passes every <see cref="ProcessInterval"/>.
+    /// </summary>
+    public Task StartAsync(CancellationToken ct) {
+        logger.LogInformation("Service starting...");
+        timer.Change(StartupDelay, ProcessInterval);
+        return Task.CompletedTask;
+    }
+
+    /// <summary>
+    /// Stops the timer and requests cancellation of the pass in progress, if any.
+    /// Does not wait for the pass to finish.
+    /// </summary>
+    public Task StopAsync(CancellationToken ct) {
+        logger.LogInformation("Service stopping...");
+        timer.Change(Timeout.Infinite, Timeout.Infinite);
+        cts.Cancel();
+        return Task.CompletedTask;
+    }
+
+    /// <summary>
+    /// Runs one OCR pass: collects the ids of all image media without OCR data and
+    /// processes each one on a <c>LimitedConcurrencyTaskScheduler</c>.
+    /// </summary>
+    /// <param name="ct">
+    /// Cancels items that have not started yet; items already running are not interrupted.
+    /// </param>
+    /// <remarks>
+    /// The returned task is never awaited by the timer callback, so every failure is
+    /// caught and logged here instead of being left unobserved.
+    /// </remarks>
+    async Task ProcessAllAsync(CancellationToken ct) {
+        try {
+            using var scope = scopeFactory.CreateScope();
+            var mediaService = scope.ServiceProvider
+                .GetRequiredService<IMediaService>();
+
+            // Scope the context so it is released before the OCR work starts.
+            Guid[] guids;
+            using(var db = dbFactory.CreateDbContext()) {
+                guids = db.Media
+                    .Where(m => m.OcrData == null)
+                    .Where(m => m.MimeType.Contains("image/"))
+                    .Select(m => m.Guid)
+                    .ToArray();
+            }
+
+            logger.LogInformation(
+                "Performing OCR pass on {Count} media items", guids.Length);
+
+            var factory = new TaskFactory(new LimitedConcurrencyTaskScheduler());
+            var tasks = new List<Task>(guids.Length);
+
+            var stopwatch = Stopwatch.StartNew();
+
+            foreach(var guid in guids) {
+                tasks.Add(factory.StartNew(() => Process(guid, mediaService), ct));
+            }
+
+            await Task.WhenAll(tasks);
+            stopwatch.Stop();
+
+            var time = stopwatch.Elapsed.ToStringHumanReadable();
+            logger.LogInformation(
+                "Performed OCR pass on {Count} media items in {Time}", guids.Length, time);
+        } catch(OperationCanceledException) {
+            logger.LogInformation("OCR pass cancelled");
+        } catch(Exception ex) {
+            logger.LogError(ex, "OCR pass failed");
+        }
+    }
+
+    /// <summary>
+    /// Runs OCR on a single media item and saves the recognised text, a normalised
+    /// searchable form of it, and the current UTC timestamp.
+    /// </summary>
+    /// <param name="media">Guid of the media item to process.</param>
+    /// <param name="mediaService">Service used to resolve the item's file path.</param>
+    /// <remarks>
+    /// Failures are logged and swallowed so one bad item does not fault the whole pass.
+    /// A failed item keeps no OCR data and is therefore picked up again by the next pass.
+    /// </remarks>
+    private void Process(Guid media, IMediaService mediaService) {
+        logger.LogDebug("Performing OCR on media item {Media}", media);
+
+        try {
+            using var db = dbFactory.CreateDbContext();
+            var item = db.Media
+                .Include(x => x.OcrData)
+                .FirstOrDefault(x => x.Guid == media);
+
+            // The item may have been deleted since the pass listed it.
+            if(item is null) {
+                logger.LogDebug("Media item {Media} no longer exists, skipping OCR", media);
+                return;
+            }
+
+            OcrData ocr = item.OcrData ?? new();
+
+            using var engine = new TesseractEngine("tessdata", "eng", EngineMode.Default);
+            using var image = Pix.LoadFromFile(mediaService.GetPath(item));
+            // Point Tesseract's debug output at the platform's null device.
+            engine.SetVariable("debug_file", NullFile);
+
+            // The page holds native resources and must be disposed after reading the text.
+            using var page = engine.Process(image);
+
+            ocr.Timestamp = DateTime.UtcNow;
+            ocr.Text = page.GetText().Trim();
+            // Invariant lowercasing keeps the result independent of the server culture
+            // (e.g. Turkish 'I' -> 'ı' would otherwise be stripped by the regex).
+            ocr.SearchableText = SpaceRegex
+                .Replace(ocr.Text.ToLowerInvariant(), " ")
+                .Trim();
+
+            item.OcrData = ocr;
+            db.SaveChanges();
+        } catch(Exception ex) {
+            logger.LogError(ex, "OCR failed for media item {Media}", media);
+        }
+    }
+
+    /// <summary>
+    /// Path of the null device on the current platform: "NUL" on Windows,
+    /// "/dev/null" otherwise.
+    /// </summary>
+    private static string NullFile =>
+        RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "NUL" : "/dev/null";
+}