Get parsing working, and HTML generation from resulting objects

This commit is contained in:
Saphire 2022-02-21 02:27:27 +07:00
parent a7250ceacf
commit 38f317afd3
Signed by: Saphire
GPG Key ID: B26EB7A1F07044C4
9 changed files with 171 additions and 146 deletions

1
.gitignore vendored
View File

@ -4,6 +4,7 @@
/obj
/bin
/cache
output.html
omnisharp.json
thread_*.json
*.js

View File

@ -9,7 +9,7 @@ public record ThreadPost
public string Author { get; set; }
public string Uid { get; set; }
public string RawHtml { get; set; }
public ParsedContent.ParsedContent? ParsedContent { get; set; }
public RootNode? ParsedContent { get; set; }
public string? File { get; set; }
public string? Filename { get; set; }
public string? Title { get; set; }
@ -18,8 +18,10 @@ public record ThreadPost
[JsonIgnore]
public bool IsChapterAnnounce { get; set; } = false;
[JsonIgnore]
public ChapterMetadata? Chapter { get; set; }
public List<int>? RepliesTo { get; set; }
[JsonIgnore]
public bool AuthorPost { get; set; } = false;
}
public record Metadata

View File

@ -6,10 +6,28 @@ using System.Web;
namespace QuestReader.Models.ParsedContent;
public class ParsedContent
/// <summary>
/// Base type for every node of parsed post content.
/// JSON conversion is delegated to <see cref="ContentConverter"/>.
/// </summary>
[JsonConverter(typeof(ContentConverter))]
public abstract class ContentNode
{
    /// <summary>
    /// Name of the concrete runtime class with every "Node" occurrence removed
    /// (for example a class named <c>TextNode</c> yields "Text").
    /// </summary>
    public string Type => GetType().Name.Replace("Node", "");
}
/// <summary>
/// A content node that owns an ordered list of child nodes.
/// </summary>
public abstract class ContainerNode : ContentNode
{
    public Version Version { get; set; }

    /// <summary>Direct children of this container.</summary>
    public IList<ContentNode> Nodes { get; set; }

    /// <summary>Debug representation: the node type followed by its children, one per line.</summary>
    public override string ToString()
    {
        var children = string.Join(",\n", Nodes);
        return $"{Type} [ {children} ]";
    }

    /// <summary>
    /// Collects the ids referenced anywhere below this container, descending into nested containers.
    /// </summary>
    /// <returns>
    /// For each <see cref="ReferenceNode"/> found, its <c>PostId</c>, or its <c>ThreadId</c> when no post id is set.
    /// </returns>
    public IEnumerable<int> GetReferences() => Nodes.SelectMany(ReferencesOf);

    // Maps a single child to the ids it contributes: containers recurse, references yield one id,
    // every other node (including null) contributes nothing.
    private static IEnumerable<int> ReferencesOf(ContentNode child) => child switch
    {
        ContainerNode nested => nested.GetReferences(),
        ReferenceNode reference => new List<int> { reference.PostId ?? reference.ThreadId },
        _ => Array.Empty<int>(),
    };
}
class ContentConverter : JsonConverter<ContentNode>
@ -28,54 +46,42 @@ class ContentConverter : JsonConverter<ContentNode>
ContentNode value,
JsonSerializerOptions options)
{
switch (value)
{
switch (value) {
case null:
JsonSerializer.Serialize(writer, null as ContentNode, options);
break;
case TextNode textNode:
JsonSerializer.Serialize(writer, textNode.Text, options);
break;
default:
{
if (value is RootNode)
throw new InvalidDataContractException("RootNode must not be used");
var type = value.GetType();
JsonSerializer.Serialize(writer, value, type, options);
break;
}
}
var type = value.GetType();
JsonSerializer.Serialize(writer, value, type, options);
break;
};
}
}
[JsonConverter(typeof(ContentConverter))]
public abstract class ContentNode
public class RootNode : ContainerNode
{
public string Type { get => GetType().Name.Replace("Node", ""); }
public virtual string Render(TemplateModel model)
{
throw new NotImplementedException("Rendering is not supported for this node type");
}
public Version Version { get; set; }
}
public class TextNode : ContentNode
{
public string Text { get; set; }
public override string ToString() => $"\"{Text}\"";
public override string Render(TemplateModel model) => HttpUtility.HtmlEncode(Text);
public override string ToString() => $"{Text}";
}
public class NewlineNode : ContentNode
{
public override string ToString() => $"<br>";
public override string Render(TemplateModel model) => "<br>";
public override string ToString() => $"\n";
}
public class ReferenceNode : ContentNode
{
public int PostId { get; set; }
public int? PostId { get; set; }
public int ThreadId { get; set; }
public ReferenceType ReferenceType { get; set; }
public bool LongReference { get; set; }
@ -88,20 +94,11 @@ public enum ReferenceType
QuestDiscussion
}
public abstract class ContainerNode : ContentNode
public class YoutubeEmbedNode : ContentNode
{
public IList<ContentNode> Nodes { get; set; }
public override string ToString() => $"{Type} [ {string.Join(",\n", Nodes)} ]";
}
// A temporary container to recursively parse everything of a note before bailing and MUST NOT BE USED NORMALLY
public class RootNode : ContainerNode
{
public override string ToString() => throw new InvalidDataContractException("RootNode must not be used");
public override string Render(TemplateModel model) => throw new InvalidDataContractException("RootNode must not be used");
}
/// <remarks>Todo: Make this a URL</remarks>
public string VideoLink { get; set; }
};
public class QuoteNode : ContainerNode { };
@ -117,7 +114,15 @@ public class InlineCodeNode : ContainerNode { };
public class UnderlineNode : ContainerNode { };
public class SmallFontNode : ContainerNode { };
/// <summary>
/// Container node created by the parser for a span whose only attribute is a
/// <c>color: #hex</c> style.
/// </summary>
public class ColorNode : ContainerNode
{
/// <summary>The captured hex colour, including the leading '#'.</summary>
public string Color { get; set; }
};
/// <summary>
/// Container node created by the parser for an anchor element that carries no CSS classes.
/// </summary>
public class ExternalLinkNode : ContainerNode
{
/// <summary>
/// Value of the anchor's <c>href</c> attribute; the parser stores the literal "ERROR" when the attribute is missing.
/// </summary>
/// <remarks>Todo: Make this a URL</remarks>
public string Destination { get; set; }
}

View File

@ -6,6 +6,6 @@ public class TemplateModel
public DateTime Now { get; set; }
public List<ThreadPost> Posts { get; set; }
public List<ThreadPost> AllPosts { get; set; }
public string BaseUrl { get; set; }
public string AssetsPath { get; set; }
public string ToolVersion { get; set; }
}

View File

@ -11,31 +11,23 @@ public class Generator
public PostsSource PostsSource { get; set; }
public string QuestPath { get; set; }
public string AssetsPath { get; set; }
public string OutputPath { get; set; }
public Generator(string questName)
{
QuestPath = $"quests/{questName}";
QuestName = questName;
PostsSource = new PostsSource(questName, QuestPath);
var chapterAnnounces = PostsSource.Metadata.Chapters.Select(c => c.Announce ?? c.Start);
PostsSource.Accepted.Where(p => chapterAnnounces.Contains(p.Id)).ToList().ForEach(p =>
{
p.IsChapterAnnounce = true;
p.Chapter = PostsSource.Metadata.Chapters.Single(c => (c.Announce ?? c.Start) == p.Id);
});
AssetsPath = $"/static/{questName}";
PostsSource = new PostsSource(questName);
var razorEngine = new RazorStandalone<StandaloneTemplate<TemplateModel>>("QuestReader");
var templateFile = "page_template.cshtml";
var baseUrl = "";
RazorTemplate = razorEngine.Compile(
"page_template.cshtml"
) ?? throw new Exception("No template");
Console.WriteLine($"Using \"{templateFile}\" with base URL {baseUrl}");
Console.WriteLine($"Using \"{templateFile}\" with base URL {AssetsPath}");
}
public string Run()
@ -43,17 +35,17 @@ public class Generator
RazorTemplate.Model = new TemplateModel
{
Metadata = PostsSource.Metadata,
Posts = PostsSource.Accepted,
Posts = PostsSource.Accepted.ToList(),
AllPosts = PostsSource.Posts,
Now = @DateTime.UtcNow,
BaseUrl = $"/static/{QuestName}",
AssetsPath = AssetsPath.TrimEnd('/'), // Strip trailing slash
ToolVersion = Assembly.GetEntryAssembly()?.GetCustomAttribute<AssemblyInformationalVersionAttribute>()?.InformationalVersion ?? "unknown"
};
var outputStream = new MemoryStream();
RazorTemplate.ExecuteAsync(outputStream).Wait();
var outputPath = Path.Join(QuestPath, "output.html");
var outputPath = Path.Join(OutputPath ?? PostsSource.BasePath, "output.html");
Console.WriteLine($"Template output {outputStream.Length} bytes");
File.WriteAllBytes(outputPath, outputStream.ToArray());
Console.WriteLine($"Wrote output to {outputPath}");

View File

@ -9,12 +9,16 @@ public class PostsSource
{
public List<ThreadPost> Posts { get; set; }
public List<ThreadPost> Accepted { get; set; }
public HashSet<ThreadPost> Accepted { get; set; }
public Metadata Metadata { get; set; }
public PostsSource(string questName, string basePath)
public string BasePath { get; set; }
public PostsSource(string questName)
{
BasePath = $"quests/{questName}";
var options = new JsonSerializerOptions
{
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
@ -22,45 +26,41 @@ public class PostsSource
WriteIndented = true
};
using var fileStream = File.OpenRead(Path.Combine(basePath, "metadata.json"));
using var fileStream = File.OpenRead(Path.Combine(BasePath, "metadata.json"));
Metadata = JsonSerializer.Deserialize<Metadata>(fileStream, options)
?? throw new InvalidDataException("Empty deserialisation result for quest metadata");
fileStream.Dispose();
Console.Out.WriteLine($"Loaded metadata: {Metadata}");
Posts = Metadata.Threads.SelectMany(tId =>
{
using var fileStream = File.OpenRead(Path.Combine(basePath, $"thread_{tId}.json"));
var threadData = JsonSerializer.Deserialize<List<ThreadPost>>(fileStream, options)
?? throw new InvalidDataException("Empty deserialisation result for thread data");
fileStream.Dispose();
Posts = Metadata.Threads
.SelectMany(tId => QuestdenParse.GetThread(tId, BasePath).Result)
.ToList();
return threadData;
}).ToList();
using var postsListStream = File.OpenRead(Path.Combine(basePath, "accepted.json"));
using var postsListStream = File.OpenRead(Path.Combine(BasePath, "accepted.json"));
var ids = JsonSerializer.Deserialize<List<int>>(postsListStream, options)
?? throw new InvalidDataException("Empty deserialisation result for quest metadata");
Accepted = Posts.Where(p => ids.Contains(p.Id)).ToList();
Accepted = Posts.Where(p => ids.Contains(p.Id)).ToHashSet();
foreach (var chapter in Metadata.Chapters)
{
var post = Accepted.Single(p => p.Id == (chapter.Announce ?? chapter.Start));
post.IsChapterAnnounce = true;
post.Chapter = chapter;
}
Console.Out.WriteLine($"Loaded a list of {Accepted.Count} posts, referencing {Accepted.Where(a => a.File is not null).Count()} files");
var rx = new Regex(@"data-post-ref=""(\d+)""",
RegexOptions.Compiled | RegexOptions.IgnoreCase);
foreach (var post in Posts)
foreach (var post in Accepted)
{
var matches = rx.Matches(post.RawHtml);
if (!matches.Any())
continue;
post.RepliesTo = new List<int>();
foreach (Match match in matches)
{
var replyId = int.Parse(match.Groups[1].Value);
var found = Posts.FirstOrDefault(p => p.Id == replyId);
if (found is null)
continue;
post.RepliesTo.Add(replyId);
}
post.AuthorPost = true;
if (post.ParsedContent is null || post.ParsedContent.Version < QuestdenParse.LatestCompatibleVersion)
throw new NotImplementedException("Repairing missing post content or updating it is not implemented yet");
}
var referenced = Accepted.SelectMany(p => p.ParsedContent!.GetReferences());
Accepted.UnionWith(Posts.Where(p => referenced.Contains(p.Id)));
Accepted = Accepted.OrderBy(p => p.Id).ToHashSet();
Console.Out.WriteLine($"Done loading with {Accepted.Count} posts, referencing {Accepted.Where(a => a.File is not null).Count()} files");
}
}

View File

@ -11,17 +11,17 @@ namespace QuestReader.Services;
public class QuestdenParse
{
static readonly Version LatestCompatibleVersion = new(1, 0, 2);
public static readonly Version LatestCompatibleVersion = new(1, 0, 2);
static Regex RefRegex { get; } = new Regex(@"^ref\|(questarch|questdis|quest)\|(\d+)\|(\d+)$", RegexOptions.Compiled);
static Regex LongRefRegex { get; } = new Regex(@"(?:https?://)?(www.)?(tgchan|questden).org/kusaba/(questarch|questdis|quest)/res/(\d+).html#?i?(\d+)?$", RegexOptions.Compiled);
static Regex LongRefRegex { get; } = new Regex(@"(?:https?://)?(?:www.)?(?:tgchan|questden).org/kusaba/(questarch|questdis|quest)/res/(\d+).html#?i?(\d+)?$", RegexOptions.Compiled);
static Regex DateRegex { get; } = new Regex(@"(\d{4,4})\/(\d\d)\/(\d\d)\(\w+\)(\d\d):(\d\d)", RegexOptions.Compiled);
static Regex FilenameRegex { get; } = new Regex(@"File \d+\.[^ ]+ - \([\d\.KMG]+B , \d+x\d+ , (.*) \)", RegexOptions.Compiled);
public static async Task GetThread(int threadId)
public static async Task<IEnumerable<ThreadPost>> GetThread(int threadId, string destinationPath)
{
var url = $"http://questden.org/kusaba/quest/res/{threadId}.html";
var options = new JsonSerializerOptions
@ -31,26 +31,29 @@ public class QuestdenParse
WriteIndented = true
};
var doc = new HtmlDocument();
doc.OptionEmptyCollection = true;
var doc = new HtmlDocument
{
OptionEmptyCollection = true
};
if (File.Exists($"thread_{threadId}.json"))
return;
// Todo: check if the thread data & parsed entity is of same version
// Reuse a previously saved thread dump instead of downloading and parsing the thread again.
// The file that is read back must be the same one whose existence is checked; the previous
// code read a placeholder path ("asd") and could never load the cached data.
var savedThreadFile = Path.Join(destinationPath, $"thread_{threadId}.json");
if (File.Exists(savedThreadFile))
    return JsonSerializer.Deserialize<IEnumerable<ThreadPost>>(File.ReadAllText(savedThreadFile), options)
        ?? throw new NullReferenceException("No data loaded");
var cacheFile = $"cache/QuestDen-{threadId}.html";
var cacheDir = Path.Join(destinationPath, "cache");
var cacheFile = Path.Join(cacheDir, $"QuestDen-{threadId}.html");
if (!File.Exists(cacheFile))
{
var httpClient = new HttpClient();
var content = await httpClient.GetStringAsync(url);
if (!Directory.Exists("cache"))
Directory.CreateDirectory("cache");
if (!Directory.Exists(cacheDir))
Directory.CreateDirectory(cacheDir);
File.WriteAllText(cacheFile, content);
doc.LoadHtml(content);
}
else
{
doc.LoadHtml(File.ReadAllText(cacheFile));
}
var nodes = doc.DocumentNode.SelectNodes(".//*[@class='reply']|.//form[@id='delform']");
@ -59,11 +62,11 @@ public class QuestdenParse
{
var post = ParsePost(node, threadId);
posts.Add(post);
//var postJson = JsonSerializer.Serialize(post);
//Console.Out.WriteLine($"{postJson}\n");
}
File.WriteAllText($"thread_{threadId}.json", JsonSerializer.Serialize(posts, options));
return posts;
}
public static ThreadPost ParsePost(string postHtml, int threadId)
{
var htmlDoc = new HtmlDocument();
@ -106,7 +109,7 @@ public class QuestdenParse
post.File = postNode
.SelectNodes("./div[@class='postwidth']//*[@class='filesize']/a")
.SingleOrDefault()
?.Attributes["href"].Value.Trim();
?.Attributes["href"].DeEntitizeValue.Replace("/kusaba/questarch/src/", "").Trim();
var filenameRaw = postNode
.SelectNodes("./div[@class='postwidth']//*[@class='filesize']")
@ -148,18 +151,14 @@ public class QuestdenParse
return post;
}
public static ParsedContent ParseContent(string postHtml)
public static RootNode ParseContent(string postHtml)
{
var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(postHtml);
var rootNode = RecursiveParse(htmlDoc.DocumentNode);
if (rootNode is not RootNode)
var parseResult = RecursiveParse(htmlDoc.DocumentNode);
if (parseResult is not RootNode rootNode)
throw new Exception("Parsing returned a non-RootNode root");
return new ParsedContent
{
Version = LatestCompatibleVersion,
Nodes = ((RootNode)rootNode).Nodes
};
return rootNode;
}
private static ContentNode RecursiveParse(HtmlNode node, ContentNode? parentNode = null)
@ -167,8 +166,12 @@ public class QuestdenParse
if (node is null)
throw new NullReferenceException("Html node is null");
if (node is HtmlTextNode textNode)
return new TextNode { Text = HttpUtility.HtmlDecode(textNode.Text.Trim()) };
if (node is HtmlTextNode textNode) {
var decoded = HttpUtility.HtmlDecode(textNode.Text.Trim());
if (parentNode is QuoteNode)
decoded = Regex.Replace(decoded, @"^>\s*", "");
return new TextNode { Text = decoded };
}
if (node.NodeType is HtmlNodeType.Document or HtmlNodeType.Element)
{
@ -206,13 +209,20 @@ public class QuestdenParse
&& match.Success
=> new ReferenceNode
{
PostId = int.Parse((match.Groups[5]?.Success ?? false) ? match.Groups[5].Value : match.Groups[4].Value),
ThreadId = int.Parse(match.Groups[4].Value),
PostId = int.Parse((match.Groups[3]?.Success ?? false) ? match.Groups[3].Value : match.Groups[2].Value),
ThreadId = int.Parse(match.Groups[2].Value),
ReferenceType = match.Groups[1].Value switch
{
"quest" => ReferenceType.QuestActive,
"questarch" => ReferenceType.QuestArchive,
"questdis" => ReferenceType.QuestDiscussion,
_ => throw new InvalidDataException(""),
},
LongReference = true
},
"a" when !node.GetClasses().Any() => new ExternalLinkNode { Destination = node.GetAttributeValue("href", "ERROR") },
"br" => new NewlineNode { },
"#document" => new RootNode { },
"#document" => new RootNode { Version = LatestCompatibleVersion },
"i" => new ItalicsNode { },
"b" => new BoldNode { },
"strike" => new StrikeoutNode { },
@ -231,6 +241,24 @@ public class QuestdenParse
&& maybeStyle.Name == "style"
&& maybeStyle.DeEntitizeValue == @"border-bottom: 1px solid"
=> new UnderlineNode { },
"span" when
node.GetAttributes() is var attributes
&& attributes.Count() == 1
&& attributes.Single() is var maybeStyle
&& maybeStyle.Name == "style"
&& maybeStyle.DeEntitizeValue == @"font-size:small;"
=> new SmallFontNode { },
"span" when
node.GetAttributes() is var attributes
&& attributes.Count() == 1
&& attributes.Single() is var maybeStyle
&& maybeStyle.Name == "style"
// Let's hope nobody used any colors beyond the hex ones...
// But probably will need to add support for that. Eh, later!
&& Regex.Match(maybeStyle.DeEntitizeValue, @"^color:\s*(#[0-9a-f]{3,8});?$", RegexOptions.IgnoreCase) is var match
&& match is not null
&& match.Success
=> new ColorNode { Color = match.Groups[1].Value },
"span" when
node.Descendants().Where(
d => d is not HtmlTextNode
@ -241,8 +269,9 @@ public class QuestdenParse
&& descendants.Single() is HtmlNode innerNode
&& innerNode.Name == "iframe"
&& innerNode.GetAttributeValue("src", null).Contains("youtube")
=> new TextNode { Text = $"Here be youtube link {innerNode.GetAttributeValue("src", null)}"},
"div" when
=> new YoutubeEmbedNode { VideoLink = innerNode.GetAttributes().Single(a => a.Name == "src").DeEntitizeValue },
// I have seen both being used but I am not sure as to the difference. Different software version?
"div" or "span" when
node.GetAttributes() is var attributes
&& attributes.Count() == 1
&& attributes.Single() is var maybeStyle
@ -251,9 +280,6 @@ public class QuestdenParse
=> new InlineCodeNode { },
_ => throw new InvalidDataException($"Unknown node parse attempt: {node.Name} #{node.Id} .{string.Join(".", node.GetClasses())}\n{node.OuterHtml}")
};
//if (outNode is ExternalLinkNode refNode)
//Console.Out.WriteLine($"Refnode: {string.Join(", ", node.GetClasses())} {node.OuterHtml}");
//Console.Out.WriteLine($"{node.Name}: {outNode.GetType().Name} {outNode is ContainerNode} {node.ChildNodes.Count} children, {node.Descendants().Count()} descendants");
if (outNode is ContainerNode container)
{
container.Nodes = node.ChildNodes

View File

@ -3,6 +3,7 @@
@using System.Linq
@using QuestReader.Models
@using QuestReader.Services
@using QuestReader.Extensions
@inherits StandaloneTemplate<TemplateModel>
<!DOCTYPE html>
@ -17,7 +18,7 @@
var autoDescription = $"Quest single-page archive. Generated {Model.Now} (UTC), {Model.Posts.Count} posts, {Model.Metadata.Chapters.Count} chapters";
// A hack, tbh, should be something better instead..
var description = Model.Metadata.Description ?? autoDescription;
var preview = $"https://media.lunar.exchange{Model.BaseUrl}/{Model.Metadata.SocialPreview}";
var preview = $"https://media.lunar.exchange{Model.AssetsPath}/{Model.Metadata.SocialPreview}";
}
<title>@title</title>
<link rel="stylesheet" href="main.css">
@ -57,40 +58,35 @@
</header>
<main>
@{
Func<(ThreadPost, bool), object> makePost =
@<article id="post-@item.Item1.Id" class="post@(item.Item1 is not null ? " image-post" : "")@(item.Item2 ? "" : " suggestion-post")">
@if (item.Item1.Title is not null) {
<h2 class="post-self-title">@item.Item1.Title</h2>
Func<ThreadPost, object> makePost =
@<article id="post-@item.Id" class="post@(item.File is not null ? " image-post" : "")@(item.AuthorPost ? "" : " suggestion-post")" data-postid="@item.Id">
@if (item.Title is not null) {
<h2 class="post-self-title">@item.Title</h2>
}
<h3 class="post-header"><a class="post-anchor" href="#post-@item.Item1.Id"><span class="post-anchor-mark">#</span>@item.Item1.Id</a> <span class="author">@item.Item1.Author</span> <time>@item.Item1.Date</time></h3>
<h3 class="post-header"><a class="post-anchor" href="#post-@item.Id"><span class="post-anchor-mark">#</span>@item.Id</a> <span class="author">@item.Author</span> <time>@item.Date</time></h3>
<div class="post-content">
@if (item.Item1.File is not null) {
@if (item.File is not null) {
<figure class="post-image">
<img src="@Model.BaseUrl/@item.Item1.File" alt="@item.Item1.Filename">
<img src="@Model.AssetsPath/@item.File" alt="@item.Filename">
</figure>
}
@if (item.Item1.RawHtml.Trim().Length > 0) {
<div class="post-text">@Raw(item.Item1.RawHtml)</div>
@if (item.RawHtml.Trim().Length > 0) {
<div class="post-text">@Raw(item.ParsedContent!.RenderContentHtml(Model))</div>
}
</div>
</article>;
</article>
;
}
@foreach (var item in Model.Posts)
{
@if (item.IsChapterAnnounce) {
<h2 id="chapter-@item.Chapter.Id" class="chapter-announce">
// Chapter could in theory be null here, but IsChapterAnnounce is assumed to be set only when Chapter is set as well
<h2 id="chapter-@item.Chapter!.Id" class="chapter-announce">
<a class="chapter-anchor" href="#chapter-@item.Chapter.Id">#</a> <span class="chapter-name">@item.Chapter.Name</span> - <span class="chapter-subtitle">@item.Chapter.Subtitle</span>
</h2>
}
if (item.RepliesTo is not null && item.RepliesTo.Count > 0)
{
@foreach (var replyId in item.RepliesTo)
{
@makePost((Model.AllPosts.First(p => p.Id == replyId), false))
}
}
@makePost((item, true));
@makePost(item);
}
</main>
<footer>

View File

@ -152,9 +152,12 @@ a.post-anchor:hover .post-anchor-mark {
/* In-chapter stuff */
.quoted-text {
.text-quote {
color: var(--fg-muted);
}
.text-quote::before {
content: "> "
}
.post-reference {
color: var(--highlight-blue);