using HyperBooru.Util;
using Microsoft.EntityFrameworkCore;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
using Tesseract;

namespace HyperBooru.Services;

/// <summary>
/// Hosted service that periodically runs Tesseract OCR over every media item
/// that has no OCR data yet and stores the recognised text in the database.
/// </summary>
/// <remarks>
/// A timer fires <see cref="StartupDelay"/> after start and then every
/// <see cref="ProcessInterval"/>. A tick is skipped while a previous pass is
/// still running, so at most one pass is active at a time.
/// </remarks>
public class OcrService : IHostedService, IDisposable {
    private static readonly TimeSpan ProcessInterval = TimeSpan.FromMinutes(30);
    private static readonly TimeSpan StartupDelay = TimeSpan.FromSeconds(30);

    // Collapses any run of whitespace into a single separator.
    private static readonly Regex SpaceRegex = new(@"[\s\n\r]+", RegexOptions.Compiled);

    // The pass currently running (or the last one that ran), if any.
    private Task? task;

    // Cancels the pass referenced by `task`; replaced on every new pass.
    private CancellationTokenSource cts = new();

    private readonly Timer timer;
    private readonly IServiceScopeFactory scopeFactory;
    private readonly ILogger logger;
    private readonly IDbContextFactory dbFactory;

    /// <summary>
    /// Creates the service and its (initially idle) timer. The timer is armed
    /// in <see cref="StartAsync"/>.
    /// </summary>
    /// <param name="scopeFactory">Used to create a DI scope for each OCR pass.</param>
    /// <param name="logger">Destination for progress and error messages.</param>
    /// <param name="dbFactory">Creates short-lived database contexts.</param>
    // NOTE(review): `ILogger` and `IDbContextFactory` appear here without type
    // arguments; confirm the intended closed generic types against the project.
    public OcrService(
            IServiceScopeFactory scopeFactory,
            ILogger logger,
            IDbContextFactory dbFactory) {
        this.scopeFactory = scopeFactory;
        this.logger = logger;
        this.dbFactory = dbFactory;

        timer = new(OnTimerTick);
    }

    /// <summary>Arms the timer; the first pass starts after <see cref="StartupDelay"/>.</summary>
    public Task StartAsync(CancellationToken ct) {
        logger.LogInformation("Service starting...");
        timer.Change(StartupDelay, ProcessInterval);
        return Task.CompletedTask;
    }

    /// <summary>
    /// Stops the timer, cancels the running pass and waits for it to wind down.
    /// </summary>
    /// <param name="ct">
    /// Signalled by the host when shutdown should no longer be graceful; the
    /// wait is abandoned at that point.
    /// </param>
    public async Task StopAsync(CancellationToken ct) {
        logger.LogInformation("Service stopping...");
        timer.Change(Timeout.Infinite, Timeout.Infinite);
        cts.Cancel();

        // Items that are already inside Tesseract cannot be interrupted, so
        // wait for them instead of letting them outlive the host's services.
        if(task is { IsCompleted: false } running)
            await Task.WhenAny(running, Task.Delay(Timeout.Infinite, ct));
    }

    /// <summary>Releases the timer and the current cancellation source.</summary>
    public void Dispose() {
        timer.Dispose();
        cts.Dispose();
        GC.SuppressFinalize(this);
    }

    /// <summary>Timer callback: starts a new pass unless one is still in flight.</summary>
    private void OnTimerTick(object? state) {
        if(task is { IsCompleted: false })
            return;

        cts = new();
        task = ProcessAllAsync(cts.Token);
    }

    /// <summary>
    /// Runs one OCR pass over all media items that have no OCR data.
    /// </summary>
    /// <remarks>
    /// The returned task is never awaited by the timer, so every failure is
    /// logged here rather than propagated.
    /// </remarks>
    /// <param name="ct">Cancels the query and any items that have not started yet.</param>
    private async Task ProcessAllAsync(CancellationToken ct) {
        try {
            using var scope = scopeFactory.CreateScope();
            var mediaService = scope.ServiceProvider
                .GetRequiredService<IMediaService>();

            Guid[] guids;
            using(var db = dbFactory.CreateDbContext()) {
                guids = await db.Media
                    .Where(m => m.OcrData == null)
                    .Select(m => m.Guid)
                    .ToArrayAsync(ct);
            }

            logger.LogInformation(
                "Performing OCR pass on {Count} media items", guids.Length);

            // Work items are queued on the project's scheduler
            // (presumably it caps parallelism; it is defined elsewhere).
            var factory = new TaskFactory(new LimitedConcurrencyTaskScheduler());
            var stopwatch = Stopwatch.StartNew();

            var tasks = guids
                .Select(guid => factory.StartNew(
                    () => TryProcess(guid, mediaService), ct))
                .ToArray();
            await Task.WhenAll(tasks);

            stopwatch.Stop();
            var time = stopwatch.Elapsed.ToStringHumanReadable();
            logger.LogInformation(
                "Performed OCR pass on {Count} media items in {Elapsed}",
                guids.Length, time);
        } catch(OperationCanceledException) when(ct.IsCancellationRequested) {
            logger.LogInformation("OCR pass cancelled");
        } catch(Exception ex) {
            logger.LogError(ex, "OCR pass failed");
        }
    }

    /// <summary>
    /// Runs <see cref="Process"/> for one item and logs any failure, so a single
    /// unreadable file does not fault the whole pass.
    /// </summary>
    private void TryProcess(Guid media, IMediaService mediaService) {
        try {
            Process(media, mediaService);
        } catch(Exception ex) {
            logger.LogError(ex, "OCR failed for media item {Media}", media);
        }
    }

    /// <summary>
    /// Performs OCR on a single media item and saves the result.
    /// </summary>
    /// <param name="media">Guid of the media item to process.</param>
    /// <param name="mediaService">Resolves the item's file path.</param>
    private void Process(Guid media, IMediaService mediaService) {
        logger.LogDebug("Performing OCR on media item {Media}", media);

        using var db = dbFactory.CreateDbContext();
        var item = db.Media
            .Include(x => x.OcrData)
            .FirstOrDefault(x => x.Guid == media);

        // The item may have been deleted since the pass listed it.
        if(item is null) {
            logger.LogDebug(
                "Media item {Media} no longer exists; skipping OCR", media);
            return;
        }

        using var engine = new TesseractEngine("tessdata", "eng", EngineMode.Default);
        // Send Tesseract's debug output to the platform's null device.
        engine.SetVariable("debug_file", NullFile);

        using var image = Pix.LoadFromFile(mediaService.GetPath(item));
        using var page = engine.Process(image);

        OcrData ocr = item.OcrData ?? new();
        ocr.Timestamp = DateTime.UtcNow;
        ocr.Text = page.GetText();
        // Lower-cased, whitespace-normalised copy; invariant casing keeps the
        // stored value independent of the server's culture.
        ocr.SearchableText = SpaceRegex.Replace(ocr.Text.ToLowerInvariant(), " ");

        item.OcrData = ocr;
        db.SaveChanges();
    }

    /// <summary>Path of the null device on the current operating system.</summary>
    private static string NullFile =>
        RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "NUL" : "/dev/null";
}