// Reconstructed from the patch that deleted Services/OcrService.cs
// (commit 4ea3ddb38d010c2f85c22b7f1c3f2d7e0c1355e3, "Initial commit",
// Jake Mannens, Fri, 22 May 2026 12:46:00 +1000; 128 lines removed).

using HyperBooru.Util;
using Microsoft.EntityFrameworkCore;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
using Tesseract;

namespace HyperBooru.Services;

/// <summary>
/// Hosted service that periodically runs Tesseract OCR over image media that
/// has no <c>OcrData</c> yet and stores the recognised text.
/// </summary>
/// <remarks>
/// When <c>configService.EnableOcr</c> is set, the first pass starts
/// <see cref="StartupDelay"/> after <see cref="StartAsync"/> and is repeated
/// every <see cref="ProcessInterval"/>. A tick is skipped while the previous
/// pass is still running.
/// </remarks>
public class OcrService : IHostedService, IDisposable {
    // Image MIME types that are excluded from OCR passes.
    private readonly string[] InvalidMimeTypes = [ "image/heic", "image/webp" ];

    private readonly TimeSpan ProcessInterval = TimeSpan.FromMinutes(30);
    private readonly TimeSpan StartupDelay = TimeSpan.FromSeconds(30);

    // Collapses every run of characters outside [0-9a-z] into one separator.
    private readonly Regex SpaceRegex = new(@"[^0-9a-z]+", RegexOptions.Compiled);

    // Guards `task` and `cts`: the timer callback and StopAsync run on
    // different threads and both touch them.
    private readonly object gate = new();

    private Task? task;
    private CancellationTokenSource cts = new();

    private readonly Timer timer;

    private readonly IConfigService configService;
    private readonly IServiceScopeFactory scopeFactory;
    private readonly ILogger logger;
    private readonly IDbContextFactory dbFactory;

    // NOTE(review): the generic type arguments of ILogger and IDbContextFactory
    // appear to have been stripped from the copy this file was reconstructed
    // from. They are left exactly as found because the intended arguments
    // cannot be determined here; restore them before compiling. TODO confirm.
    public OcrService(
        IConfigService configService,
        IServiceScopeFactory scopeFactory,
        ILogger logger,
        IDbContextFactory dbFactory) {

        this.configService = configService;
        this.scopeFactory = scopeFactory;
        this.logger = logger;
        this.dbFactory = dbFactory;

        // Created disarmed; StartAsync arms it.
        timer = new(_ => StartPass());
    }

    /// <summary>
    /// Arms the timer when OCR is enabled in configuration; otherwise does nothing.
    /// </summary>
    public Task StartAsync(CancellationToken ct) {
        if(configService.EnableOcr) {
            logger.LogInformation("Service starting...");
            timer.Change(StartupDelay, ProcessInterval);
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Disarms the timer, cancels the running pass (if any) and waits for it to
    /// finish or for <paramref name="ct"/> to fire, whichever comes first.
    /// </summary>
    public async Task StopAsync(CancellationToken ct) {
        logger.LogInformation("Service stopping...");
        timer.Change(Timeout.Infinite, Timeout.Infinite);

        Task? running;
        lock(gate) {
            cts.Cancel();
            running = task;
        }

        if(running is null)
            return;

        // ProcessAllAsync never faults (it logs instead), so waiting on it is
        // safe; the Delay leg lets the caller's token cut the wait short
        // without throwing.
        await Task.WhenAny(running, Task.Delay(Timeout.Infinite, ct));
    }

    /// <summary>Releases the timer and the current cancellation source.</summary>
    public void Dispose() {
        timer.Dispose();
        lock(gate)
            cts.Dispose();
        GC.SuppressFinalize(this);
    }

    // Timer callback: starts a new pass unless the previous one is still running.
    private void StartPass() {
        lock(gate) {
            if(task is { IsCompleted: false })
                return;

            // The previous pass has finished, so nothing observes the old
            // source any more and it can be released before being replaced.
            cts.Dispose();
            cts = new();

            var token = cts.Token;
            // Task.Run keeps the synchronous part of the pass (the database
            // query) from executing while the lock is held.
            task = Task.Run(() => ProcessAllAsync(token));
        }
    }

    // Runs one OCR pass over every eligible media item. Never throws:
    // cancellation and failures are logged so the fire-and-forget task
    // started by the timer cannot fault unobserved.
    async Task ProcessAllAsync(CancellationToken ct) {
        try {
            using var scope = scopeFactory.CreateScope();
            var mediaService = scope.ServiceProvider
                .GetRequiredService<IMediaService>();

            Guid[] guids;
            using(var db = dbFactory.CreateDbContext()) {
                guids = db.Media
                    .AsNoTracking()
                    .Where(m => m.OcrData == null)
                    .Where(m => m.CurrentUploadedFile!.MimeType.Contains("image/"))
                    .Where(m => !InvalidMimeTypes.Contains(m.CurrentUploadedFile!.MimeType))
                    .Select(m => m.Guid)
                    .ToArray();
            }

            logger.LogInformation(
                "Performing OCR pass on {Count} media items", guids.Length);

            var factory = new TaskFactory(new LimitedConcurrencyTaskScheduler());
            var stopwatch = Stopwatch.StartNew();

            // Items still queued when `ct` is cancelled are never started.
            var tasks = guids
                .Select(guid => factory.StartNew(() => Process(guid, mediaService), ct))
                .ToArray();

            await Task.WhenAll(tasks);
            stopwatch.Stop();

            var time = stopwatch.Elapsed.ToStringHumanReadable();
            logger.LogInformation(
                "Performed OCR pass on {Count} media items in {Elapsed}",
                guids.Length, time);
        } catch(OperationCanceledException) {
            logger.LogInformation("OCR pass cancelled");
        } catch(Exception ex) {
            logger.LogError(ex, "OCR pass failed");
        }
    }

    // Performs OCR on a single media item and saves the result. A failure is
    // logged and swallowed so one bad file cannot abort the rest of the pass.
    private void Process(Guid media, IMediaService mediaService) {
        logger.LogDebug("Performing OCR on media item {Media}", media);

        try {
            using var db = dbFactory.CreateDbContext();
            var item = db.Media
                .Include(x => x.OcrData)
                .FirstOrDefault(x => x.Guid == media);

            // The row may have been deleted since the pass collected its ids.
            if(item is null) {
                logger.LogWarning(
                    "Media item {Media} no longer exists; skipping OCR", media);
                return;
            }

            OcrData ocr = item.OcrData ?? new();

            using var engine = new TesseractEngine("tessdata", "eng", EngineMode.Default);
            // Point Tesseract's "debug_file" at the platform's null device.
            engine.SetVariable("debug_file", NullFile);

            using var image = Pix.LoadFromFile(mediaService.GetPath(item));
            // The page returned by Process is disposable; release it with the engine.
            using var page = engine.Process(image);

            ocr.Timestamp = DateTime.UtcNow;
            ocr.Text = page.GetText().Trim();
            // Invariant lowercasing: the regex only keeps ASCII [0-9a-z], so a
            // culture-specific mapping (e.g. Turkish 'I' -> 'ı') would
            // silently drop letters from the searchable text.
            ocr.SearchableText = SpaceRegex
                .Replace(ocr.Text.ToLowerInvariant(), " ")
                .Trim();

            item.OcrData = ocr;
            db.SaveChanges();
        } catch(Exception ex) {
            logger.LogError(ex, "OCR failed for media item {Media}", media);
        }
    }

    // Path of the null device on the current operating system.
    private string NullFile =>
        RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "NUL" : "/dev/null";
}