From a7250ceacf8189db59b98d96bbcb381023192332 Mon Sep 17 00:00:00 2001 From: Saphire Date: Sat, 19 Feb 2022 23:42:47 +0700 Subject: [PATCH] Add basic HTML parsing --- .gitignore | 6 +- DataModel.cs => Models/JsonFiles.cs | 29 +- Models/ParsedContent.cs | 123 ++++++++ Models/Template.cs | 11 + Services/Generator.cs | 6 +- Services/PostsSource.cs | 4 +- Services/QuestdenParse.cs | 269 ++++++++++++++++++ .../RazorStandalone.cs | 11 +- kusaba.js | 65 ----- page_template.cshtml | 2 + quest_reader.csproj | 1 + web/main.css | 161 +++++++++++ web/main.ts | 38 +++ web/tsconfig.json | 1 + 14 files changed, 627 insertions(+), 100 deletions(-) rename DataModel.cs => Models/JsonFiles.cs (74%) create mode 100644 Models/ParsedContent.cs create mode 100644 Models/Template.cs create mode 100644 Services/QuestdenParse.cs rename RazorStandalone.cs => Services/RazorStandalone.cs (96%) delete mode 100644 kusaba.js create mode 100644 web/main.css create mode 100644 web/main.ts create mode 100644 web/tsconfig.json diff --git a/.gitignore b/.gitignore index 44d1e0b..5e655e1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,8 @@ /quests /rust /obj -/bin \ No newline at end of file +/bin +/cache +omnisharp.json +thread_*.json +*.js \ No newline at end of file diff --git a/DataModel.cs b/Models/JsonFiles.cs similarity index 74% rename from DataModel.cs rename to Models/JsonFiles.cs index 5025f0d..c9e3702 100644 --- a/DataModel.cs +++ b/Models/JsonFiles.cs @@ -1,6 +1,7 @@ -namespace QuestReader; +namespace QuestReader.Models; using System.Text.Json.Serialization; +using QuestReader.Models.ParsedContent; public record ThreadPost { @@ -8,6 +9,7 @@ public record ThreadPost public string Author { get; set; } public string Uid { get; set; } public string RawHtml { get; set; } + public ParsedContent.ParsedContent? ParsedContent { get; set; } public string? File { get; set; } public string? Filename { get; set; } public string? Title { get; set; } @@ -42,29 +44,4 @@ public record ChapterMetadata public int Start { get; set; } public int? Announce { get; set; } public int End { get; set; } -} - -public enum ParamType -{ - Invalid, - PostId, - UniqueId, - Username -} - -public enum ParamError -{ - Invalid, - NoError, - NotFound -} - -public class TemplateModel -{ - public Metadata Metadata { get; set; } - public DateTime Now { get; set; } - public List Posts { get; set; } - public List AllPosts { get; set; } - public string BaseUrl { get; set; } - public string ToolVersion { get; set; } } \ No newline at end of file diff --git a/Models/ParsedContent.cs b/Models/ParsedContent.cs new file mode 100644 index 0000000..1a65e0e --- /dev/null +++ b/Models/ParsedContent.cs @@ -0,0 +1,123 @@ +using System.Linq.Expressions; +using System.Runtime.Serialization; +using System.Text.Json; +using System.Text.Json.Serialization; +using System.Web; + +namespace QuestReader.Models.ParsedContent; + +public class ParsedContent +{ + public Version Version { get; set; } + public IList Nodes { get; set; } +} + +class ContentConverter : JsonConverter +{ + public override ContentNode Read( + ref Utf8JsonReader reader, + Type typeToConvert, + JsonSerializerOptions options + ) + { + throw new NotImplementedException(); + } + + public override void Write( + Utf8JsonWriter writer, + ContentNode value, + JsonSerializerOptions options) + { + switch (value) + { + case null: + JsonSerializer.Serialize(writer, null as ContentNode, options); + break; + default: + { + if (value is RootNode) + throw new InvalidDataContractException("RootNode must not be used"); + var type = value.GetType(); + + JsonSerializer.Serialize(writer, value, type, options); + break; + } + } + } +} + +[JsonConverter(typeof(ContentConverter))] +public abstract class ContentNode +{ + public string Type { get => GetType().Name.Replace("Node", ""); } + + public virtual string Render(TemplateModel model) + { + throw new NotImplementedException("Rendering is not supported for this node type"); + } +} + +public class TextNode : ContentNode +{ + public string Text { get; set; } + + public override string ToString() => $"\"{Text}\""; + + public override string Render(TemplateModel model) => HttpUtility.HtmlEncode(Text); +} + +public class NewlineNode : ContentNode +{ + public override string ToString() => $"
"; + + public override string Render(TemplateModel model) => "
"; +} + +public class ReferenceNode : ContentNode +{ + public int PostId { get; set; } + public int ThreadId { get; set; } + public ReferenceType ReferenceType { get; set; } + public bool LongReference { get; set; } +} + +public enum ReferenceType +{ + QuestActive, + QuestArchive, + QuestDiscussion +} + +public abstract class ContainerNode : ContentNode +{ + public IList Nodes { get; set; } + + public override string ToString() => $"{Type} [ {string.Join(",\n", Nodes)} ]"; +} + +// A temporary container to recursively parse everything of a note before bailing and MUST NOT BE USED NORMALLY +public class RootNode : ContainerNode +{ + public override string ToString() => throw new InvalidDataContractException("RootNode must not be used"); + + public override string Render(TemplateModel model) => throw new InvalidDataContractException("RootNode must not be used"); +} + +public class QuoteNode : ContainerNode { }; + +public class BoldNode : ContainerNode { }; + +public class ItalicsNode : ContainerNode { }; + +public class StrikeoutNode : ContainerNode { }; + +public class SpoilerNode : ContainerNode { }; + +public class InlineCodeNode : ContainerNode { }; + +public class UnderlineNode : ContainerNode { }; + +public class ExternalLinkNode : ContainerNode +{ + public string Destination { get; set; } +} \ No newline at end of file diff --git a/Models/Template.cs b/Models/Template.cs new file mode 100644 index 0000000..b52f033 --- /dev/null +++ b/Models/Template.cs @@ -0,0 +1,11 @@ +namespace QuestReader.Models; + +public class TemplateModel +{ + public Metadata Metadata { get; set; } + public DateTime Now { get; set; } + public List Posts { get; set; } + public List AllPosts { get; set; } + public string BaseUrl { get; set; } + public string ToolVersion { get; set; } +} \ No newline at end of file diff --git a/Services/Generator.cs b/Services/Generator.cs index a115396..a0a20e6 100644 --- a/Services/Generator.cs +++ b/Services/Generator.cs @@ -1,6 +1,7 @@ namespace QuestReader.Services; using System.Reflection; +using QuestReader.Models; public class Generator { @@ -21,7 +22,8 @@ public class Generator var chapterAnnounces = PostsSource.Metadata.Chapters.Select(c => c.Announce ?? c.Start); - PostsSource.Accepted.Where(p => chapterAnnounces.Contains(p.Id)).ToList().ForEach(p => { + PostsSource.Accepted.Where(p => chapterAnnounces.Contains(p.Id)).ToList().ForEach(p => + { p.IsChapterAnnounce = true; p.Chapter = PostsSource.Metadata.Chapters.Single(c => (c.Announce ?? c.Start) == p.Id); }); @@ -38,7 +40,7 @@ public class Generator public string Run() { - RazorTemplate.Model = new TemplateModel + RazorTemplate.Model = new TemplateModel { Metadata = PostsSource.Metadata, Posts = PostsSource.Accepted, diff --git a/Services/PostsSource.cs b/Services/PostsSource.cs index 5e164c8..78eca60 100644 --- a/Services/PostsSource.cs +++ b/Services/PostsSource.cs @@ -3,6 +3,7 @@ namespace QuestReader.Services; using System.Text.Json; using System.Text.Json.Serialization; using System.Text.RegularExpressions; +using QuestReader.Models; public class PostsSource { @@ -27,7 +28,8 @@ public class PostsSource fileStream.Dispose(); Console.Out.WriteLine($"Loaded metadata: {Metadata}"); - Posts = Metadata.Threads.SelectMany(tId => { + Posts = Metadata.Threads.SelectMany(tId => + { using var fileStream = File.OpenRead(Path.Combine(basePath, $"thread_{tId}.json")); var threadData = JsonSerializer.Deserialize>(fileStream, options) ?? throw new InvalidDataException("Empty deserialisation result for thread data"); diff --git a/Services/QuestdenParse.cs b/Services/QuestdenParse.cs new file mode 100644 index 0000000..9068d89 --- /dev/null +++ b/Services/QuestdenParse.cs @@ -0,0 +1,269 @@ +using System.Globalization; +using System.Text.Json; +using System.Text.Json.Serialization; +using System.Text.RegularExpressions; +using System.Web; +using HtmlAgilityPack; +using QuestReader.Models; +using QuestReader.Models.ParsedContent; + +namespace QuestReader.Services; + +public class QuestdenParse +{ + static readonly Version LatestCompatibleVersion = new(1, 0, 2); + + static Regex RefRegex { get; } = new Regex(@"^ref\|(questarch|questdis|quest)\|(\d+)\|(\d+)$", RegexOptions.Compiled); + + static Regex LongRefRegex { get; } = new Regex(@"(?:https?://)?(www.)?(tgchan|questden).org/kusaba/(questarch|questdis|quest)/res/(\d+).html#?i?(\d+)?$", RegexOptions.Compiled); + + static Regex DateRegex { get; } = new Regex(@"(\d{4,4})\/(\d\d)\/(\d\d)\(\w+\)(\d\d):(\d\d)", RegexOptions.Compiled); + + static Regex FilenameRegex { get; } = new Regex(@"File \d+\.[^ ]+ - \([\d\.KMG]+B , \d+x\d+ , (.*) \)", RegexOptions.Compiled); + + public static async Task GetThread(int threadId) + { + var url = $"http://questden.org/kusaba/quest/res/{threadId}.html"; + var options = new JsonSerializerOptions + { + PropertyNamingPolicy = JsonNamingPolicy.CamelCase, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull, + WriteIndented = true + }; + + var doc = new HtmlDocument(); + doc.OptionEmptyCollection = true; + + if (File.Exists($"thread_{threadId}.json")) + return; + + var cacheFile = $"cache/QuestDen-{threadId}.html"; + if (!File.Exists(cacheFile)) + { + var httpClient = new HttpClient(); + var content = await httpClient.GetStringAsync(url); + if (!Directory.Exists("cache")) + Directory.CreateDirectory("cache"); + File.WriteAllText(cacheFile, content); + doc.LoadHtml(content); + } + else + { + doc.LoadHtml(File.ReadAllText(cacheFile)); + } + + var nodes = doc.DocumentNode.SelectNodes(".//*[@class='reply']|.//form[@id='delform']"); + + var posts = new List(); + foreach (var node in nodes) + { + var post = ParsePost(node, threadId); + posts.Add(post); + //var postJson = JsonSerializer.Serialize(post); + //Console.Out.WriteLine($"{postJson}\n"); + } + File.WriteAllText($"thread_{threadId}.json", JsonSerializer.Serialize(posts, options)); + } + public static ThreadPost ParsePost(string postHtml, int threadId) + { + var htmlDoc = new HtmlDocument(); + htmlDoc.LoadHtml(postHtml); + return ParsePost(htmlDoc.DocumentNode.FirstChild, threadId); + } + + public static ThreadPost ParsePost(HtmlNode postNode, int threadId) + { + var post = new ThreadPost { }; + + var id = postNode + .SelectNodes("./div[@class='postwidth']/a[@name!='s']") + .Single() + .Attributes["name"].Value.Trim(); + post.Id = id == "s" ? threadId : int.Parse(id); + post.Title = postNode + .SelectNodes("./div[@class='postwidth']//*[@class='filetitle']") + .SingleOrDefault() + ?.InnerText.Trim(); + post.Author = postNode + .SelectNodes("./div[@class='postwidth']//*[@class='postername']") + .Single() + .InnerText.Trim(); + post.Uid = postNode + .SelectNodes("./div[@class='postwidth']//*[@class='uid']") + .Single() + .InnerText.Trim() + .Replace("ID: ", "", true, CultureInfo.InvariantCulture); + post.Date = DateTime.Parse( + DateRegex.Replace(postNode + .SelectNodes("./div[@class='postwidth']/label/text()[last()]") + .Single() + .InnerText.Trim(), + "$1-$2-$3T$4:$5" + ), + null, + DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal + ); + post.File = postNode + .SelectNodes("./div[@class='postwidth']//*[@class='filesize']/a") + .SingleOrDefault() + ?.Attributes["href"].Value.Trim(); + + var filenameRaw = postNode + .SelectNodes("./div[@class='postwidth']//*[@class='filesize']") + .SingleOrDefault() + ?.InnerText.Trim(); + if (filenameRaw is not null) + { + filenameRaw = Regex.Replace(filenameRaw, @"\s*\n\s*", " ", RegexOptions.Multiline); + post.Filename = filenameRaw.Contains("File ") ? + FilenameRegex.Match(filenameRaw)?.Groups[1]?.Value + ?? null : null; + } + + post.Tripcode = postNode + .SelectNodes("./div[@class='postwidth']//*[@class='postertrip']") + .SingleOrDefault() + ?.InnerText.Trim(); + post.RawHtml = Regex.Replace( + postNode + .SelectNodes("./blockquote") + .Single() + .InnerHtml + .Replace("\r", " ") + .Replace(@"

", "") + .Trim(), + @"\s*$", + "" + ); + try + { + post.ParsedContent = ParseContent(post.RawHtml); + } + catch (FormatException) + { + Console.WriteLine($"\n{post.Id} {post.RawHtml.Replace("\r", "")}\n"); + throw; + } + + return post; + } + + public static ParsedContent ParseContent(string postHtml) + { + var htmlDoc = new HtmlDocument(); + htmlDoc.LoadHtml(postHtml); + var rootNode = RecursiveParse(htmlDoc.DocumentNode); + if (rootNode is not RootNode) + throw new Exception("Parsing returned a non-RootNode root"); + return new ParsedContent + { + Version = LatestCompatibleVersion, + Nodes = ((RootNode)rootNode).Nodes + }; + } + + private static ContentNode RecursiveParse(HtmlNode node, ContentNode? parentNode = null) + { + if (node is null) + throw new NullReferenceException("Html node is null"); + + if (node is HtmlTextNode textNode) + return new TextNode { Text = HttpUtility.HtmlDecode(textNode.Text.Trim()) }; + + if (node.NodeType is HtmlNodeType.Document or HtmlNodeType.Element) + { + ContentNode outNode = node.Name.ToLowerInvariant() switch + { + "a" when + node.GetClasses().Count() == 1 + && node.FirstChild?.NodeType == HtmlNodeType.Text + && node.Descendants().Count() == 1 + && node.GetClasses().Single() is var classname + && RefRegex.Match(classname) is var match && match is not null + && match.Success + && HttpUtility.HtmlDecode(node.FirstChild?.InnerText) is var innerText && innerText is not null + && (innerText == $">>{match.Groups[3].Value}" || innerText == $">>/{match.Groups[1].Value}/{match.Groups[3].Value}") + => new ReferenceNode + { + PostId = int.Parse(match.Groups[3].Value), + ThreadId = int.Parse(match.Groups[2].Value), + ReferenceType = match.Groups[1].Value switch + { + "quest" => ReferenceType.QuestActive, + "questarch" => ReferenceType.QuestArchive, + "questdis" => ReferenceType.QuestDiscussion, + _ => throw new InvalidDataException(""), + }, + LongReference = false + }, + "a" when + !node.GetClasses().Any() + && node.FirstChild is HtmlTextNode firstNode && firstNode is not null + && node.Descendants().Count() == 1 + && HttpUtility.HtmlDecode(firstNode.Text) is var nodeText + && node.GetAttributeValue("href", "ERROR") == nodeText + && LongRefRegex.Match(nodeText) is var match && match is not null + && match.Success + => new ReferenceNode + { + PostId = int.Parse((match.Groups[5]?.Success ?? false) ? match.Groups[5].Value : match.Groups[4].Value), + ThreadId = int.Parse(match.Groups[4].Value), + LongReference = true + }, + "a" when !node.GetClasses().Any() => new ExternalLinkNode { Destination = node.GetAttributeValue("href", "ERROR") }, + "br" => new NewlineNode { }, + "#document" => new RootNode { }, + "i" => new ItalicsNode { }, + "b" => new BoldNode { }, + "strike" => new StrikeoutNode { }, + "span" when + node.GetClasses() is var classes + && classes.Count() == 1 + && classes.Single() == "spoiler" => new SpoilerNode { }, + "span" when + node.GetClasses() is var classes + && classes.Count() == 1 + && classes.Single() == "unkfunc" => new QuoteNode { }, + "span" when + node.GetAttributes() is var attributes + && attributes.Count() == 1 + && attributes.Single() is var maybeStyle + && maybeStyle.Name == "style" + && maybeStyle.DeEntitizeValue == @"border-bottom: 1px solid" + => new UnderlineNode { }, + "span" when + node.Descendants().Where( + d => d is not HtmlTextNode + || (d is HtmlTextNode textNode + && !string.IsNullOrWhiteSpace(textNode.Text.Trim())) + ) is var descendants + && descendants.Count() == 1 + && descendants.Single() is HtmlNode innerNode + && innerNode.Name == "iframe" + && innerNode.GetAttributeValue("src", null).Contains("youtube") + => new TextNode { Text = $"Here be youtube link {innerNode.GetAttributeValue("src", null)}"}, + "div" when + node.GetAttributes() is var attributes + && attributes.Count() == 1 + && attributes.Single() is var maybeStyle + && maybeStyle.Name == "style" + && maybeStyle.DeEntitizeValue == @"white-space: pre-wrap !important; font-family: monospace, monospace !important;" + => new InlineCodeNode { }, + _ => throw new InvalidDataException($"Unknown node parse attempt: {node.Name} #{node.Id} .{string.Join(".", node.GetClasses())}\n{node.OuterHtml}") + }; + //if (outNode is ExternalLinkNode refNode) + //Console.Out.WriteLine($"Refnode: {string.Join(", ", node.GetClasses())} {node.OuterHtml}"); + //Console.Out.WriteLine($"{node.Name}: {outNode.GetType().Name} {outNode is ContainerNode} {node.ChildNodes.Count} children, {node.Descendants().Count()} descendants"); + if (outNode is ContainerNode container) + { + container.Nodes = node.ChildNodes + .Select(n => RecursiveParse(n, container)) + .Where(n => n is not TextNode || (n is TextNode textNode && !string.IsNullOrWhiteSpace(textNode.Text))) + .ToList(); + } + return outNode; + } + + throw new Exception("Unsupported HTML node type"); + } +} diff --git a/RazorStandalone.cs b/Services/RazorStandalone.cs similarity index 96% rename from RazorStandalone.cs rename to Services/RazorStandalone.cs index 7e2094a..82c4b21 100644 --- a/RazorStandalone.cs +++ b/Services/RazorStandalone.cs @@ -1,3 +1,5 @@ +namespace QuestReader.Services; + using System.Reflection; using System.Text; using Microsoft.AspNetCore.Razor.Language; @@ -6,8 +8,6 @@ using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.CSharp; using Microsoft.CodeAnalysis.Emit; -namespace QuestReader; - public class RazorStandalone { RazorProjectEngine Engine { get; set; } @@ -63,7 +63,7 @@ public class RazorStandalone } var asm = Assembly.Load(memoryStream.ToArray()); - var templateInstance = (TTemplate?) Activator.CreateInstance(asm.GetType("QuestReader.Template")); + var templateInstance = (TTemplate?)Activator.CreateInstance(asm.GetType("QuestReader.Template")); if (templateInstance is null) throw new Exception("Template is null"); @@ -127,7 +127,7 @@ public abstract class StandaloneTemplate await Output.WriteAsync(literal); } - string? Suffix {get;set;} + string? Suffix { get; set; } public async Task BeginWriteAttributeAsync( string name, @@ -145,7 +145,8 @@ public abstract class StandaloneTemplate await WriteAsync(value); } - public async Task EndWriteAttributeAsync() { + public async Task EndWriteAttributeAsync() + { await WriteLiteralAsync(Suffix!); Suffix = null; } diff --git a/kusaba.js b/kusaba.js deleted file mode 100644 index 2436f7f..0000000 --- a/kusaba.js +++ /dev/null @@ -1,65 +0,0 @@ -await (async () => { - delete Array.prototype.toJSON; - - const processReply = (elem) => { - const id = +elem.querySelector(":scope > .postwidth > a[name]:not([name=s])").getAttribute("name"); - const title = elem.querySelector(":scope > .postwidth .filetitle")?.innerText.trim(); - const author = elem.querySelector(":scope > .postwidth .postername").innerText.trim(); - const uid = elem.querySelector(":scope > .postwidth .uid").innerText.replace("ID: ", ""); - const file = elem.querySelector(":scope > .postwidth > .filesize > a")?.href ?? undefined; - const postertrip = elem.querySelector(":scope > .postwidth .postertrip")?.innerText.trim(); - const rawHtml = elem.querySelector(":scope > blockquote").innerHTML - .replace(`

`,"") - .trim(); - const date = [...elem.querySelector(":scope > .postwidth > label").childNodes] - .pop().data.trim() - .replace( - /(\d{4,4})\/(\d\d)\/(\d\d)\(\w+\)(\d\d):(\d\d)/, - "$1-$2-$3T$4:$5:00Z" - ) - .replace( - /href=\\"\/kusaba\/questarch\/res\/\d+.html#\d+\\" onclick=\\"return highlight\('\d+', true\);\\"/, - "" - ); - - const filenameRaw = elem.querySelector(":scope > .postwidth > .filesize")?.innerText; - const filename = filenameRaw?.includes("File ") ? - filenameRaw.match(/File \d+\.[^ ]+ - \([\d\.KMG]+B , \d+x\d+ , (.*) \)/)[1] - ?? undefined : undefined; - const ret = { - id, - author, - uid, - rawHtml, - date - } - if (file) ret.file = file; - if (file) ret.filename = filename; - if (postertrip) ret.tripcode = postertrip; - if (title) ret.title = title; - return ret; - } - - const replies = [...document.getElementsByClassName("reply")]; - replies.unshift(document.getElementById("delform")) - - const processed = replies.map(elem => processReply(elem)); - - const blob = new Blob( - [JSON.stringify(processed, null, 4)], - {type : 'application/json'} - ) - - const a = document.createElement("a"); - const url = URL.createObjectURL(blob); - a.href = url; - a.download = `thread_${processed[0].id}.json`; - document.body.appendChild(a); - a.click(); - URL.revokeObjectURL(url); - a.remove(); - - - return ; - -})(); \ No newline at end of file diff --git a/page_template.cshtml b/page_template.cshtml index fc0575e..8efc8db 100644 --- a/page_template.cshtml +++ b/page_template.cshtml @@ -1,6 +1,8 @@ @namespace QuestReader @using System @using System.Linq +@using QuestReader.Models +@using QuestReader.Services @inherits StandaloneTemplate diff --git a/quest_reader.csproj b/quest_reader.csproj index e287624..d7e1cf0 100644 --- a/quest_reader.csproj +++ b/quest_reader.csproj @@ -18,6 +18,7 @@ 1.0.1 + QuestReader this.init(), false); + } + + init() { + let options = { + root: null, + rootMargin: "0px", + threshold: 1.0 + }; + + this.observer = new IntersectionObserver((entries, observer) => this.handleIntersect(entries, observer), options); + + var all = document.querySelectorAll(".chapter-announce"); + all.forEach(elem => this.observer.observe(elem)); + this.observer.observe(document.querySelector("footer")); + console.log("Intersection observer ready"); + } + + handleIntersect(entries: IntersectionObserverEntry[], observer: IntersectionObserver) { + entries.filter(e => e.isIntersecting).forEach(e => { + if (e.target.className == "chapter-announce" && !e.target.nextElementSibling.querySelector("img").complete) + return; + window.plausible("landmark", {props: {id: e.target.id ? e.target.id : e.target.tagName}}); + observer.unobserve(e.target); + console.log("Reached landmark " + e.target.id ? e.target.id : e.target.tagName); + } + ); + } +} +new VisitAnalytics(); \ No newline at end of file diff --git a/web/tsconfig.json b/web/tsconfig.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/web/tsconfig.json @@ -0,0 +1 @@ +{} \ No newline at end of file