diff options
| -rw-r--r-- | Controllers/MediaController.cs | 3 | ||||
| -rw-r--r-- | HBContext.cs | 1 | ||||
| -rw-r--r-- | Media.cs | 15 | ||||
| -rw-r--r-- | Migrations/20230831162159_MediaOcr.Designer.cs | 310 | ||||
| -rw-r--r-- | Migrations/20230831162159_MediaOcr.cs | 51 | ||||
| -rw-r--r-- | Migrations/HBContextModelSnapshot.cs | 43 | ||||
| -rw-r--r-- | Pages/TagDefinitions.razor | 7 | ||||
| -rw-r--r-- | Pages/ViewMedia.razor | 26 | ||||
| -rw-r--r-- | Program.cs | 3 | ||||
| -rw-r--r-- | Server.csproj | 2 | ||||
| -rw-r--r-- | Services/OcrService.cs | 116 | ||||
| -rw-r--r-- | Services/SearchService.cs | 60 | ||||
| -rw-r--r-- | Todo.md | 5 | ||||
| -rw-r--r-- | Util.cs | 99 | ||||
| -rw-r--r-- | appsettings.Development.json | 3 | ||||
| -rw-r--r-- | tessdata/eng.traineddata | bin | 0 -> 15400601 bytes | |||
| -rw-r--r-- | wwwroot/styles/global.css | 11 |
17 files changed, 728 insertions, 27 deletions
diff --git a/Controllers/MediaController.cs b/Controllers/MediaController.cs index 85dfc65..8070199 100644 --- a/Controllers/MediaController.cs +++ b/Controllers/MediaController.cs @@ -1,9 +1,6 @@ using HyperBooru.Services; using HyperBooru.Util; -using ImageMagick; using Microsoft.AspNetCore.Mvc; -using MimeDetective; -using System.Security.Cryptography; namespace HyperBooru.Controllers; diff --git a/HBContext.cs b/HBContext.cs index f6bc15c..15dad6d 100644 --- a/HBContext.cs +++ b/HBContext.cs @@ -15,6 +15,7 @@ public class HBContext : DbContext { public DbSet<Tag> Tags { get; set; } public DbSet<Media> Media { get; set; } public DbSet<UploadedFile> UploadedFiles { get; set; } + public DbSet<OcrData> OcrData { get; set; } private IConfigService config; @@ -13,6 +13,7 @@ public class Media : HBObject { public string? LongDescription { get; set; } public int Width { get; set; } public int Height { get; set; } + public virtual OcrData? OcrData { get; set; } public virtual List<UploadedFile> UploadedFiles { get; set; } = new(); public bool IsIngest => Tags @@ -26,7 +27,7 @@ public class Media : HBObject { return UploadedFiles .OrderBy(f => f.UploadTime) - .First()?.Filename; + .First()?.Filename ?? Guid.ToString().ToUpper(); } } } @@ -40,4 +41,16 @@ public class UploadedFile : HBObject { public DateTime? LastWriteTime { get; set; } public DateTime? CreateTime { get; set; } public virtual Media Media { get; set; } +} + +public class OcrData { + [Key] + [DatabaseGenerated(DatabaseGeneratedOption.Identity)] + public int OcrDataId { get; set; } + [ForeignKey("ObjectId")] + public int MediaId { get; set; } + public string Text { get; set; } + public string SearchableText { get; set; } + public DateTime Timestamp { get; set; } + public virtual Media Media { get; set; } }
\ No newline at end of file diff --git a/Migrations/20230831162159_MediaOcr.Designer.cs b/Migrations/20230831162159_MediaOcr.Designer.cs new file mode 100644 index 0000000..866f7ed --- /dev/null +++ b/Migrations/20230831162159_MediaOcr.Designer.cs @@ -0,0 +1,310 @@ +// <auto-generated /> +using System; +using HyperBooru; +using Microsoft.EntityFrameworkCore; +using Microsoft.EntityFrameworkCore.Infrastructure; +using Microsoft.EntityFrameworkCore.Migrations; +using Microsoft.EntityFrameworkCore.Storage.ValueConversion; +using Npgsql.EntityFrameworkCore.PostgreSQL.Metadata; + +#nullable disable + +namespace HyperBooru.Migrations +{ + [DbContext(typeof(HBContext))] + [Migration("20230831162159_MediaOcr")] + partial class MediaOcr + { + /// <inheritdoc /> + protected override void BuildTargetModel(ModelBuilder modelBuilder) + { +#pragma warning disable 612, 618 + modelBuilder + .HasAnnotation("ProductVersion", "7.0.10") + .HasAnnotation("Relational:MaxIdentifierLength", 63); + + NpgsqlModelBuilderExtensions.UseIdentityByDefaultColumns(modelBuilder); + + modelBuilder.Entity("HyperBooru.HBObject", b => + { + b.Property<int>("ObjectId") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property<int>("ObjectId")); + + b.Property<Guid>("Guid") + .HasColumnType("uuid"); + + b.HasKey("ObjectId"); + + b.HasIndex("Guid"); + + b.ToTable("Objects", (string)null); + + b.UseTptMappingStrategy(); + }); + + modelBuilder.Entity("HyperBooru.OcrData", b => + { + b.Property<int>("OcrDataId") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property<int>("OcrDataId")); + + b.Property<int>("MediaId") + .HasColumnType("integer"); + + b.Property<string>("SearchableText") + .IsRequired() + .HasColumnType("text"); + + b.Property<string>("Text") + .IsRequired() + .HasColumnType("text"); + + b.Property<DateTime>("Timestamp") + .HasColumnType("timestamp with time zone"); + + b.HasKey("OcrDataId"); + + b.HasIndex("MediaId") + .IsUnique(); + + b.ToTable("OcrData"); + }); + + modelBuilder.Entity("TagDefinitionTagDefinition", b => + { + b.Property<int>("ImplicitTagsObjectId") + .HasColumnType("integer"); + + b.Property<int>("TagDefinitionObjectId") + .HasColumnType("integer"); + + b.HasKey("ImplicitTagsObjectId", "TagDefinitionObjectId"); + + b.HasIndex("TagDefinitionObjectId"); + + b.ToTable("TagDefinitionTagDefinition"); + }); + + modelBuilder.Entity("HyperBooru.Media", b => + { + b.HasBaseType("HyperBooru.HBObject"); + + b.Property<string>("Checksum") + .IsRequired() + .HasColumnType("text"); + + b.Property<int>("Height") + .HasColumnType("integer"); + + b.Property<string>("LongDescription") + .HasColumnType("text"); + + b.Property<string>("MimeType") + .IsRequired() + .HasColumnType("text"); + + b.Property<string>("ShortDescription") + .HasColumnType("text"); + + b.Property<int>("Width") + .HasColumnType("integer"); + + b.ToTable("Media", (string)null); + }); + + modelBuilder.Entity("HyperBooru.Tag", b => + { + b.HasBaseType("HyperBooru.HBObject"); + + b.Property<DateTime>("CreateTime") + .HasColumnType("timestamp with time zone"); + + b.Property<int>("TagDefinitionId") + .HasColumnType("integer"); + + b.Property<int>("TargetObjectId") + .HasColumnType("integer"); + + b.HasIndex("TagDefinitionId"); + + b.HasIndex("TargetObjectId"); + + b.ToTable("Tags", (string)null); + }); + + modelBuilder.Entity("HyperBooru.TagDefinition", b => + { + b.HasBaseType("HyperBooru.HBObject"); + + b.Property<string>("Alias") + .HasColumnType("text"); + + b.Property<string>("Name") + .IsRequired() + .HasColumnType("text"); + + b.Property<string>("Namespace") + .HasColumnType("text"); + + b.Property<int>("Source") + .HasColumnType("integer"); + + b.ToTable("TagDefinitions", (string)null); + + b.HasData( + new + { + ObjectId = -1, + Guid = new Guid("ebdad4f8-455a-4351-8017-1d4854d6fa38"), + Name = "nsfw", + Source = 0 + }, + new + { + ObjectId = -2, + Guid = new Guid("ea212801-5bcc-4c0e-814f-fb9d30db58bc"), + Name = "ingest", + Source = 0 + }); + }); + + modelBuilder.Entity("HyperBooru.UploadedFile", b => + { + b.HasBaseType("HyperBooru.HBObject"); + + b.Property<DateTime?>("CreateTime") + .HasColumnType("timestamp with time zone"); + + b.Property<string>("Filename") + .HasColumnType("text"); + + b.Property<DateTime?>("LastAccessTime") + .HasColumnType("timestamp with time zone"); + + b.Property<DateTime?>("LastWriteTime") + .HasColumnType("timestamp with time zone"); + + b.Property<long>("Length") + .HasColumnType("bigint"); + + b.Property<int>("MediaObjectId") + .HasColumnType("integer"); + + b.Property<string>("OriginalChecksum") + .IsRequired() + .HasColumnType("text"); + + b.Property<DateTime>("UploadTime") + .HasColumnType("timestamp with time zone"); + + b.HasIndex("MediaObjectId"); + + b.ToTable("UploadedFiles", (string)null); + }); + + modelBuilder.Entity("HyperBooru.OcrData", b => + { + b.HasOne("HyperBooru.Media", "Media") + .WithOne("OcrData") + .HasForeignKey("HyperBooru.OcrData", "MediaId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("Media"); + }); + + modelBuilder.Entity("TagDefinitionTagDefinition", b => + { + b.HasOne("HyperBooru.TagDefinition", null) + .WithMany() + .HasForeignKey("ImplicitTagsObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("HyperBooru.TagDefinition", null) + .WithMany() + .HasForeignKey("TagDefinitionObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + }); + + modelBuilder.Entity("HyperBooru.Media", b => + { + b.HasOne("HyperBooru.HBObject", null) + .WithOne() + .HasForeignKey("HyperBooru.Media", "ObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + }); + + modelBuilder.Entity("HyperBooru.Tag", b => + { + b.HasOne("HyperBooru.HBObject", null) + .WithOne() + .HasForeignKey("HyperBooru.Tag", "ObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("HyperBooru.TagDefinition", "TagDefinition") + .WithMany() + .HasForeignKey("TagDefinitionId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("HyperBooru.HBObject", "Target") + .WithMany("Tags") + .HasForeignKey("TargetObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("TagDefinition"); + + b.Navigation("Target"); + }); + + modelBuilder.Entity("HyperBooru.TagDefinition", b => + { + b.HasOne("HyperBooru.HBObject", null) + .WithOne() + .HasForeignKey("HyperBooru.TagDefinition", "ObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + }); + + modelBuilder.Entity("HyperBooru.UploadedFile", b => + { + b.HasOne("HyperBooru.Media", "Media") + .WithMany("UploadedFiles") + .HasForeignKey("MediaObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.HasOne("HyperBooru.HBObject", null) + .WithOne() + .HasForeignKey("HyperBooru.UploadedFile", "ObjectId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("Media"); + }); + + modelBuilder.Entity("HyperBooru.HBObject", b => + { + b.Navigation("Tags"); + }); + + modelBuilder.Entity("HyperBooru.Media", b => + { + b.Navigation("OcrData"); + + b.Navigation("UploadedFiles"); + }); +#pragma warning restore 612, 618 + } + } +} diff --git a/Migrations/20230831162159_MediaOcr.cs b/Migrations/20230831162159_MediaOcr.cs new file mode 100644 index 0000000..02f6185 --- /dev/null +++ b/Migrations/20230831162159_MediaOcr.cs @@ -0,0 +1,51 @@ +using System; +using Microsoft.EntityFrameworkCore.Migrations; +using Npgsql.EntityFrameworkCore.PostgreSQL.Metadata; + +#nullable disable + +namespace HyperBooru.Migrations +{ + /// <inheritdoc /> + public partial class MediaOcr : Migration + { + /// <inheritdoc /> + protected override void Up(MigrationBuilder migrationBuilder) + { + migrationBuilder.CreateTable( + name: "OcrData", + columns: table => new + { + OcrDataId = table.Column<int>(type: "integer", nullable: false) + .Annotation("Npgsql:ValueGenerationStrategy", NpgsqlValueGenerationStrategy.IdentityByDefaultColumn), + MediaId = table.Column<int>(type: "integer", nullable: false), + Text = table.Column<string>(type: "text", nullable: false), + SearchableText = table.Column<string>(type: "text", nullable: false), + Timestamp = table.Column<DateTime>(type: "timestamp with time zone", nullable: false) + }, + constraints: table => + { + table.PrimaryKey("PK_OcrData", x => x.OcrDataId); + table.ForeignKey( + name: "FK_OcrData_Media_MediaId", + column: x => x.MediaId, + principalTable: "Media", + principalColumn: "ObjectId", + onDelete: ReferentialAction.Cascade); + }); + + migrationBuilder.CreateIndex( + name: "IX_OcrData_MediaId", + table: "OcrData", + column: "MediaId", + unique: true); + } + + /// <inheritdoc /> + protected override void Down(MigrationBuilder migrationBuilder) + { + migrationBuilder.DropTable( + name: "OcrData"); + } + } +} diff --git a/Migrations/HBContextModelSnapshot.cs b/Migrations/HBContextModelSnapshot.cs index 06b3e20..72b662f 100644 --- a/Migrations/HBContextModelSnapshot.cs +++ b/Migrations/HBContextModelSnapshot.cs @@ -42,6 +42,36 @@ namespace HyperBooru.Migrations b.UseTptMappingStrategy(); }); + modelBuilder.Entity("HyperBooru.OcrData", b => + { + b.Property<int>("OcrDataId") + .ValueGeneratedOnAdd() + .HasColumnType("integer"); + + NpgsqlPropertyBuilderExtensions.UseIdentityByDefaultColumn(b.Property<int>("OcrDataId")); + + b.Property<int>("MediaId") + .HasColumnType("integer"); + + b.Property<string>("SearchableText") + .IsRequired() + .HasColumnType("text"); + + b.Property<string>("Text") + .IsRequired() + .HasColumnType("text"); + + b.Property<DateTime>("Timestamp") + .HasColumnType("timestamp with time zone"); + + b.HasKey("OcrDataId"); + + b.HasIndex("MediaId") + .IsUnique(); + + b.ToTable("OcrData"); + }); + modelBuilder.Entity("TagDefinitionTagDefinition", b => { b.Property<int>("ImplicitTagsObjectId") @@ -174,6 +204,17 @@ namespace HyperBooru.Migrations b.ToTable("UploadedFiles", (string)null); }); + modelBuilder.Entity("HyperBooru.OcrData", b => + { + b.HasOne("HyperBooru.Media", "Media") + .WithOne("OcrData") + .HasForeignKey("HyperBooru.OcrData", "MediaId") + .OnDelete(DeleteBehavior.Cascade) + .IsRequired(); + + b.Navigation("Media"); + }); + modelBuilder.Entity("TagDefinitionTagDefinition", b => { b.HasOne("HyperBooru.TagDefinition", null) @@ -256,6 +297,8 @@ namespace HyperBooru.Migrations modelBuilder.Entity("HyperBooru.Media", b => { + b.Navigation("OcrData"); + b.Navigation("UploadedFiles"); }); #pragma warning restore 612, 618 diff --git a/Pages/TagDefinitions.razor b/Pages/TagDefinitions.razor index f5339e7..f32e803 100644 --- a/Pages/TagDefinitions.razor +++ b/Pages/TagDefinitions.razor @@ -43,12 +43,7 @@ @(", ") } } - -@* @(string.Join(", ", tagDef.ImplicitTags - .Where(it => it.Source == TagSource.UserTag) - .Select(it => it.Name) - .Order())) -*@ </i> + </i> </td> <td class="actions"> <a href="javascript:;" @onclick=@(() => PromptToEdit(tagDef))>Edit</a> diff --git a/Pages/ViewMedia.razor b/Pages/ViewMedia.razor index bb6a207..eb49b15 100644 --- a/Pages/ViewMedia.razor +++ b/Pages/ViewMedia.razor @@ -62,17 +62,18 @@ <ButtonContainer> <button @onclick=@(() => deleteDialog.Show()) class="warning">Delete</button> <button @onclick=@(() => tagDialog.Show()) class="secondary">Add Tag</button> - @if(media.IsIngest) { - <button @onclick=@(() => SetIngest(false))>Mark Tagging Complete</button> - } else { - <button class="secondary" @onclick=@(() => SetIngest(true))>Mark Tagging Incomplete</button> - } + <button @onclick=@(() => ocrDialog.Show()) class="secondary">View OCR</button> @if(infoEditMode) { <button @onclick=@(() => ApplyInfoEdit(false)) class="secondary">Cancel</button> <button @onclick=@(() => ApplyInfoEdit(true))>Apply</button> } else { <button @onclick=@(() => InfoEditMode = true) class="secondary">Edit Info</button> } + @if(media.IsIngest) { + <button @onclick=@(() => SetIngest(false))>Mark Tagging Complete</button> + } else { + <button class="secondary" @onclick=@(() => SetIngest(true))>Mark Tagging Incomplete</button> + } </ButtonContainer> </div> </div> @@ -85,6 +86,17 @@ </ButtonContainer> </Dialog> +<Dialog Title="OCR Data" @ref=ocrDialog> + @if(media.OcrData is null) { + <p><center>This media item hasn't been scanned yet!</center></p> + } else { + <code style="max-height:400px;">@media.OcrData?.Text</code> + } + <ButtonContainer> + <button @onclick=@(() => ocrDialog.Hide())>Close</button> + </ButtonContainer> +</Dialog> + <TagSelectDialog Title="Select one or more tag(s) to add" OnSubmit=AddTags @@ -103,9 +115,10 @@ private string? shortDescription; private string? longDescription; + private MediaTagTable mediaTagTable; private Dialog deleteDialog; + private Dialog ocrDialog; private TagSelectDialog tagDialog; - private MediaTagTable mediaTagTable; private HBContext db; @@ -119,6 +132,7 @@ .Include(m => m.Tags) .ThenInclude(t => t.TagDefinition) .Include(m => m.UploadedFiles) + .Include(m => m.OcrData) .First(m => m.Guid == MediaId); title = media.DisplayName ?? "Media View"; @@ -14,13 +14,14 @@ public class Program { builder.Services.AddRazorPages(); builder.Services.AddServerSideBlazor(); - // Add out custom services + // Add our custom services builder.Services.AddSingleton<IConfigService, ConfigService>(); builder.Services.AddDbContextFactory<HBContext>(); builder.Services.AddScoped<ISearchService, SearchService>(); builder.Services.AddScoped<ITagService, TagService>(); builder.Services.AddScoped<IMediaService, MediaService>(); builder.Services.AddSingleton<IUserService, UserService>(); + builder.Services.AddHostedService<OcrService>(); var app = builder.Build(); diff --git a/Server.csproj b/Server.csproj index 0997d2e..45be4f1 100644 --- a/Server.csproj +++ b/Server.csproj @@ -31,6 +31,8 @@ <PackageReference Include="Mime-Detective" Version="23.6.1" /> <PackageReference Include="Npgsql.EntityFrameworkCore.PostgreSQL" Version="7.0.4" /> <PackageReference Include="Swashbuckle.AspNetCore" Version="6.5.0" /> + <PackageReference Include="System.Drawing.Common" Version="7.0.0" /> + <PackageReference Include="Tesseract" Version="5.2.0" /> </ItemGroup> </Project> diff --git a/Services/OcrService.cs b/Services/OcrService.cs new file mode 100644 index 0000000..743f8f6 --- /dev/null +++ b/Services/OcrService.cs @@ -0,0 +1,116 @@ +using HyperBooru.Util; +using Microsoft.EntityFrameworkCore; +using System.Diagnostics; +using System.Runtime.InteropServices; +using System.Text.RegularExpressions; +using Tesseract; + +namespace HyperBooru.Services; + +public class OcrService : IHostedService { + private readonly TimeSpan ProcessInterval = TimeSpan.FromMinutes(30); + private readonly TimeSpan StartupDelay = TimeSpan.FromSeconds(30); + + private readonly Regex SpaceRegex = new(@"[\s\n\r]+", RegexOptions.Compiled); + + private Task? task; + private CancellationTokenSource cts = new(); + + private Timer timer; + + private IServiceScopeFactory scopeFactory; + private ILogger<OcrService> logger; + private IDbContextFactory<HBContext> dbFactory; + + public OcrService( + IServiceScopeFactory scopeFactory, + ILogger<OcrService> logger, + IDbContextFactory<HBContext> dbFactory) { + + this.scopeFactory = scopeFactory; + this.logger = logger; + this.dbFactory = dbFactory; + + timer = new((object? state) => { + if(task is not null && !task.IsCompleted) + return; + cts = new(); + task = ProcessAllAsync(cts.Token); + }); + } + + public Task StartAsync(CancellationToken ct) { + logger.LogInformation("Service starting..."); + timer.Change(StartupDelay, ProcessInterval); + return Task.CompletedTask; + } + + public Task StopAsync(CancellationToken ct) { + logger.LogInformation("Service stopping..."); + timer.Change(Timeout.Infinite, Timeout.Infinite); + cts.Cancel(); + return Task.CompletedTask; + } + + async Task ProcessAllAsync(CancellationToken ct) { + using var scope = scopeFactory.CreateScope(); + var mediaService = scope.ServiceProvider + .GetRequiredService<IMediaService>(); + + using var db = dbFactory.CreateDbContext(); + Guid[] guids = db.Media + .Include(m => m.OcrData) + .Where(m => m.OcrData == null) + .Select(m => m.Guid) + .ToArray(); + db.Dispose(); + + logger.LogInformation($"Performing OCR pass on {guids.Count()} media items"); + + var factory = new TaskFactory(new LimitedConcurrencyTaskScheduler()); + var tasks = new List<Task>(); + + var stopwatch = Stopwatch.StartNew(); + + foreach(var guid in guids) + tasks.Add(factory.StartNew(() => Process(guid, mediaService), ct)); + + await Task.WhenAll(tasks); + stopwatch.Stop(); + + var time = stopwatch.Elapsed.ToStringHumanReadable(); + logger.LogInformation( + $"Performed OCR pass on {guids.Count()} media items in {time}"); + } + + private void Process(Guid media, IMediaService mediaService) { + logger.LogDebug($"Performing OCR on media item {media}"); + + using var db = dbFactory.CreateDbContext(); + var m = db.Media + .Include(m => m.OcrData) + .First(m => m.Guid == media); + + OcrData o = m.OcrData ?? new(); + + using var engine = new TesseractEngine("tessdata", "eng", EngineMode.Default); + using var image = Pix.LoadFromFile(mediaService.GetPath(m)); + engine.SetVariable("debug_file", NullFile); + + o.Timestamp = DateTime.UtcNow; + o.Text = engine.Process(image).GetText(); + o.SearchableText = SpaceRegex.Replace(o.Text.ToLower(), " "); + + m.OcrData = o; + db.SaveChanges(); + } + + private string NullFile { + get { + if(RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + return "NUL"; + else + return "/dev/null"; + } + } +} diff --git a/Services/SearchService.cs b/Services/SearchService.cs index e8e497d..bb2963d 100644 --- a/Services/SearchService.cs +++ b/Services/SearchService.cs @@ -24,33 +24,77 @@ public class SearchService : ISearchService { query = query.ToLower(); + int[] descriptionResults = SearchDescription(query); + int[] ocrResults = SearchOcr(query); + var matchedTag = db.TagDefinitions .FirstOrDefault(td => td.Name.ToLower() == query); int[] tags; - if(matchedTag is not null) { tags = tagService .TagsThatImply(matchedTag) .Select(td => td.ObjectId) .ToArray(); } else { - // TODO: expand scope to all tags that imply + // TODO: Expand scope to all tags that imply tags = db.TagDefinitions .Where(td => td.Name.ToLower().Contains(query)) .Select(td => td.ObjectId) .ToArray(); } + int[] tagResults = SearchTags(tags); + + int[] mediaIds = descriptionResults + .Union(ocrResults) + .Union(tagResults) + .OrderDescending() + .ToArray(); + return db.Media .Include(m => m.Tags) - .AsEnumerable() - .Where(m => m.Tags.IntersectBy(tags, t => t.TagDefinitionId).Any()) - .Concat(db.Media + .Where(m => mediaIds.Contains(m.ObjectId)) + .ToArray(); + } + + // TODO: Make asynchronous + private int[] SearchTags(int[] tags) { + return Task.Run(() => { + using var db = dbFactory.CreateDbContext(); + return db.Media + .Include(m => m.Tags) + .AsEnumerable() + .Where(m => m.Tags.IntersectBy(tags, t => t.TagDefinitionId).Any()) + .Select(m => m.ObjectId) + .ToArray(); + }).GetAwaiter().GetResult(); + } + + // TODO: Make asynchronous + private int[] SearchDescription(string query) { + return Task.Run(() => { + using var db = dbFactory.CreateDbContext(); + query = query.ToLower(); + return db.Media .Where(m => (m.ShortDescription != null && m.ShortDescription.ToLower().Contains(query)) || - (m.LongDescription != null && m.LongDescription.ToLower().Contains(query)))) - .DistinctBy(m => m.ObjectId) - .ToArray(); + (m.LongDescription != null && m.LongDescription.ToLower().Contains(query))) + .Select(m => m.ObjectId) + .ToArray(); + }).GetAwaiter().GetResult(); + } + + // TODO: Make asynchronous + private int[] SearchOcr(string query) { + return Task.Run(() => { + using var db = dbFactory.CreateDbContext(); + query = query.ToLower(); + return db.OcrData + .Include(o => o.Media) + .Where(o => o.SearchableText.Contains(query)) + .Select(o => o.Media.ObjectId) + .ToArray(); + }).GetAwaiter().GetResult(); } } @@ -6,6 +6,7 @@ - Setting implicit tags removes builtin tags - UserService listeners don't seem to be removed after disposal - Cancelling tag creation creates the tag anyway + - Prevent marking tagging complete unless there are actually user tags # Short-term Features - Progressive page loading @@ -22,7 +23,9 @@ - Collections - Jump into ingest feed at random point - Rating system - - OCR character recognition + - Instantaneous OCR processing when media is uploaded + - OCR status reporting on admin page + - Dynamically update OCR data on ViewMedia page - Image deduplication by visual similarity - Rating system - Audit log @@ -18,4 +18,103 @@ public static class Extensions { double n = x / Math.Pow(10, exp / 3 * 3); return $"{Math.Round(n, 2 - (exp % 3))} {suffix}B"; } + + public static string ToStringHumanReadable(this TimeSpan t) { + if(t.TotalMilliseconds < 1000) + return string.Format("{0:0}ms", t.TotalMilliseconds); + if(t.TotalSeconds < 60) + return string.Format("{0:0.00}s", t.TotalSeconds); + if(t.TotalMinutes < 60) + return string.Format("{0:0}m{0:0}s", t.TotalMinutes, t.Seconds); + if(t.TotalHours < 24) + return string.Format("{0:0}h{0:0}m", t.TotalHours, t.Minutes); + return string.Format("{0:0.00}d", t.TotalDays); + } +} + +public class LimitedConcurrencyTaskScheduler : TaskScheduler { + public sealed override int MaximumConcurrencyLevel => + maxConcurrency; + + private int maxConcurrency; + + [ThreadStatic] + private static bool threadIsProcessingItems; + + private readonly LinkedList<Task> tasks = new(); + + private int delegatesQueuedOrRunning = 0; + + public LimitedConcurrencyTaskScheduler() { + maxConcurrency = Environment.ProcessorCount; + } + + public LimitedConcurrencyTaskScheduler(int maxConcurrency) { + if(maxConcurrency < 1) + throw new ArgumentOutOfRangeException("maxConcurrency must be greater than 0"); + this.maxConcurrency = (int) maxConcurrency; + } + + protected sealed override void QueueTask(Task task) { + lock(tasks) { + tasks.AddLast(task); + if(delegatesQueuedOrRunning < maxConcurrency) { + delegatesQueuedOrRunning++; + NotifyThreadPoolOfPendingWork(); + } + } + } + + private void NotifyThreadPoolOfPendingWork() { + ThreadPool.UnsafeQueueUserWorkItem(_ => { + threadIsProcessingItems = true; + try { + while(true) { + Task item; + lock(tasks) { + if(tasks.Count == 0) { + delegatesQueuedOrRunning--; + break; + } else { + item = tasks.First.Value; + tasks.RemoveFirst(); + } + } + TryExecuteTask(item); + } + } finally { + threadIsProcessingItems = false; + } + }, null); + } + + protected sealed override bool TryExecuteTaskInline(Task task, bool taskWasPreviouslyQueued) { + if(!threadIsProcessingItems) + return false; + + if(taskWasPreviouslyQueued) + return TryDequeue(task) ? TryExecuteTask(task) : false; + else + return TryExecuteTask(task); + } + + protected sealed override bool TryDequeue(Task task) { + lock(tasks) { + return tasks.Remove(task); + } + } + + protected sealed override IEnumerable<Task> GetScheduledTasks() { + bool lockTaken = false; + try { + Monitor.TryEnter(tasks, ref lockTaken); + if(lockTaken) + return tasks; + else + throw new NotSupportedException(); + } finally { + if(lockTaken) + Monitor.Exit(tasks); + } + } } diff --git a/appsettings.Development.json b/appsettings.Development.json index 770d3e9..6860045 100644 --- a/appsettings.Development.json +++ b/appsettings.Development.json @@ -3,7 +3,8 @@ "Logging": { "LogLevel": { "Default": "Information", - "Microsoft.AspNetCore": "Warning" + "Microsoft.AspNetCore": "Warning", + "HyperBooru.Services.OcrService": "Debug" } } } diff --git a/tessdata/eng.traineddata b/tessdata/eng.traineddata Binary files differnew file mode 100644 index 0000000..176dc32 --- /dev/null +++ b/tessdata/eng.traineddata diff --git a/wwwroot/styles/global.css b/wwwroot/styles/global.css index c0dbe3f..b694fe4 100644 --- a/wwwroot/styles/global.css +++ b/wwwroot/styles/global.css @@ -59,6 +59,17 @@ a.nondecorated:hover { color: #999; } +code { + background: #222; + border-radius: 10px; + box-sizing: border-box; + font-family: 'Lucida Console'; + font-size: 8pt; + overflow-y: auto; + padding: 20px; + white-space: pre-line; +} + button, input[type=submit] { color: white; background: var(--col-button-pri); |
