Skip to content

Commit

Permalink
* opt out of entity splitting because it currently only supports `one-o…
Browse files Browse the repository at this point in the history
…ne` relationships instead of the desired `one-zeroOrOne` (optional entity). This leads to inserting many placeholder records in which every field except the primary key is NULL, just to satisfy the `one-one` relation; without re-introducing something like `IRevision.NullFieldsBitMask`, we can't distinguish an original literal NULL value from these empty records: dotnet/efcore#27974 dotnet/efcore#29113 @ `TbmDbContext.OnModelCreating()`

+ abstract class `RevisionWithSplitting` as base class of all derived classes of `IRevision`
* insert all entities returned from `RevisionWithSplitting.GetSplitEntities()` into DB @ `CommonInSavers.SavePostsOrUsers()`
* change `abstract class BaseRevision` to `interface IRevision` to comply with single inheritance
@ crawler
  • Loading branch information
n0099 committed Jan 12, 2023
1 parent 831ea75 commit a75c8ee
Show file tree
Hide file tree
Showing 10 changed files with 172 additions and 60 deletions.
10 changes: 0 additions & 10 deletions crawler/src/Db/Revision/BaseRevision.cs

This file was deleted.

9 changes: 9 additions & 0 deletions crawler/src/Db/Revision/IRevision.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// ReSharper disable UnusedMember.Global
namespace tbm.Crawler.Db.Revision
{
    /// <summary>
    /// Contract shared by the revision entities: the two members every revision exposes.
    /// </summary>
    public interface IRevision
    {
        /// <summary>
        /// When the revision was taken.
        /// NOTE(review): presumably a Unix timestamp in seconds; confirm against the code that assigns it.
        /// </summary>
        uint TakenAt { get; set; }

        /// <summary>
        /// Optional bit mask describing which fields of the revision are null.
        /// NOTE(review): the bit-to-field assignment is not visible here; confirm against the saver that computes it.
        /// </summary>
        ushort? NullFieldsBitMask { get; set; }
    }
}
41 changes: 36 additions & 5 deletions crawler/src/Db/Revision/ReplyRevision.cs
Original file line number Diff line number Diff line change
@@ -1,14 +1,45 @@
// ReSharper disable PropertyCanBeMadeInitOnly.Global
namespace tbm.Crawler.Db.Revision
{
// NOTE(review): this listing is a rendered diff without +/- markers, so removed and added lines are interleaved.
// The `: BaseRevision` declaration and the plain auto-properties that duplicate a [NotMapped] property or a
// member of BaseReplyRevision appear to be the removed lines; confirm against the repository.
public class ReplyRevision : BaseRevision
/// <summary>
/// Revision of a reply. Floor, SubReplyCount and AgreeCount are not mapped on this entity; each is kept in its
/// own nested split entity (SplitFloor, SplitSubReplyCount, SplitAgreeCount), which TbmDbContext.OnModelCreating()
/// maps to a separate table keyed by (Pid, TakenAt).
/// </summary>
public class ReplyRevision : ReplyRevision.BaseReplyRevision
{
public ulong Pid { get; set; }
public uint Floor { get; set; }
public uint SubReplyCount { get; set; }
/// <summary>Common base of the reply revision and its split entities; adds the Pid key part to TakenAt.</summary>
public abstract class BaseReplyRevision : RevisionWithSplitting<BaseReplyRevision>
{
public ulong Pid { get; set; }
}
// Reads from / writes to the SplitFloor entity held by the base class.
// The getter yields default(uint) while no SplitFloor entity has been created.
// NOTE(review): the factory copies TakenAt and Pid at the moment the setter first runs, and later sets only
// update the value; if TakenAt/Pid are assigned after this property, the split entity keeps the earlier
// key values. Confirm that callers assign the keys first. The same applies to the other split properties below.
[NotMapped] public uint Floor
{
get => GetSplitEntityValue<SplitFloor, uint>(r => r.Floor);
set => SetSplitEntityValue<SplitFloor, uint>(value, (r, v) => r.Floor = v,
() => new() {TakenAt = TakenAt, Pid = Pid, Floor = value});
}
// Reads from / writes to the SplitSubReplyCount entity held by the base class.
[NotMapped] public uint SubReplyCount
{
get => GetSplitEntityValue<SplitSubReplyCount, uint>(r => r.SubReplyCount);
set => SetSplitEntityValue<SplitSubReplyCount, uint>(value, (r, v) => r.SubReplyCount = v,
() => new() {TakenAt = TakenAt, Pid = Pid, SubReplyCount = value});
}
public ushort? IsFold { get; set; }
public int AgreeCount { get; set; }
// Reads from / writes to the SplitAgreeCount entity held by the base class.
[NotMapped] public int AgreeCount
{
get => GetSplitEntityValue<SplitAgreeCount, int>(r => r.AgreeCount);
set => SetSplitEntityValue<SplitAgreeCount, int>(value, (r, v) => r.AgreeCount = v,
() => new() {TakenAt = TakenAt, Pid = Pid, AgreeCount = value});
}
public int? DisagreeCount { get; set; }
public byte[]? Geolocation { get; set; }

/// <summary>Split entity carrying only the Floor value next to the inherited keys.</summary>
public class SplitFloor : BaseReplyRevision
{
public uint Floor { get; set; }
}
/// <summary>Split entity carrying only the SubReplyCount value next to the inherited keys.</summary>
public class SplitSubReplyCount : BaseReplyRevision
{
public uint SubReplyCount { get; set; }
}
/// <summary>Split entity carrying only the AgreeCount value next to the inherited keys.</summary>
public class SplitAgreeCount : BaseReplyRevision
{
public int AgreeCount { get; set; }
}
}
}
27 changes: 27 additions & 0 deletions crawler/src/Db/Revision/RevisionWithSplitting.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
namespace tbm.Crawler.Db.Revision
{
    /// <summary>
    /// Base class for revisions that keep some of their values in separate "split" entities.
    /// At most one split entity is held per concrete split entity type; derived classes read and write
    /// those values through <see cref="GetSplitEntityValue{TSplitEntity,TValue}"/> and
    /// <see cref="SetSplitEntityValue{TSplitEntity,TValue}"/>.
    /// </summary>
    /// <typeparam name="TSplitEntities">Common base type of every split entity this revision can hold.</typeparam>
    public abstract class RevisionWithSplitting<TSplitEntities> : IRevision
    {
        public uint TakenAt { get; set; }
        public ushort? NullFieldsBitMask { get; set; }

        // Keyed by the concrete split entity type, so each type has at most one instance.
        private Dictionary<Type, TSplitEntities> SplitEntities { get; } = new();

        /// <summary>Returns the split entities created so far (a live view over the internal dictionary).</summary>
        public IEnumerable<TSplitEntities> GetSplitEntities() => SplitEntities.Values;

        /// <summary>
        /// Reads a value from the split entity of type <typeparamref name="TSplitEntity"/>.
        /// </summary>
        /// <param name="valueSelector">Extracts the value from the split entity.</param>
        /// <returns>
        /// The selected value, or <c>default</c> when no split entity of that type has been created yet.
        /// </returns>
        protected TValue? GetSplitEntityValue<TSplitEntity, TValue>(Func<TSplitEntity, TValue?> valueSelector)
            where TSplitEntity : class, TSplitEntities =>
            // Single lookup via TryGetValue instead of ContainsKey followed by the indexer.
            SplitEntities.TryGetValue(typeof(TSplitEntity), out var entity)
                ? valueSelector((TSplitEntity)entity!)
                : default;

        /// <summary>
        /// Writes a value to the split entity of type <typeparamref name="TSplitEntity"/>, creating the
        /// entity on first use.
        /// </summary>
        /// <param name="value">Value to store when the split entity already exists.</param>
        /// <param name="valueSetter">Assigns <paramref name="value"/> to an existing split entity.</param>
        /// <param name="entityFactory">
        /// Creates the split entity when none exists yet. <paramref name="valueSetter"/> is NOT invoked in
        /// that case, so the factory is responsible for initializing the value itself.
        /// </param>
        protected void SetSplitEntityValue<TSplitEntity, TValue>(TValue? value,
            Action<TSplitEntity, TValue?> valueSetter, Func<TSplitEntity> entityFactory)
            where TSplitEntity : class, TSplitEntities
        {
            // Single lookup via TryGetValue instead of ContainsKey followed by the indexer.
            if (SplitEntities.TryGetValue(typeof(TSplitEntity), out var entity))
                valueSetter((TSplitEntity)entity!, value);
            else
                SplitEntities[typeof(TSplitEntity)] = entityFactory();
        }
    }
}
30 changes: 26 additions & 4 deletions crawler/src/Db/Revision/SubReplyRevision.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,32 @@
// ReSharper disable PropertyCanBeMadeInitOnly.Global
namespace tbm.Crawler.Db.Revision
{
// NOTE(review): this listing is a rendered diff without +/- markers, so removed and added lines are interleaved.
// The `: BaseRevision` declaration and the plain auto-properties that duplicate a [NotMapped] property or a
// member of BaseSubReplyRevision appear to be the removed lines; confirm against the repository.
public class SubReplyRevision : BaseRevision
/// <summary>
/// Revision of a sub reply. AgreeCount and DisagreeCount are not mapped on this entity; each is kept in its own
/// nested split entity (SplitAgreeCount, SplitDisagreeCount), which TbmDbContext.OnModelCreating() maps to a
/// separate table keyed by (Spid, TakenAt).
/// </summary>
public class SubReplyRevision : SubReplyRevision.BaseSubReplyRevision
{
public ulong Spid { get; set; }
public int AgreeCount { get; set; }
public int DisagreeCount { get; set; }
/// <summary>Common base of the sub reply revision and its split entities; adds the Spid key part to TakenAt.</summary>
public abstract class BaseSubReplyRevision : RevisionWithSplitting<BaseSubReplyRevision>
{
public ulong Spid { get; set; }
}
// Reads from / writes to the SplitAgreeCount entity held by the base class.
// The getter yields default(int) while no SplitAgreeCount entity has been created.
// NOTE(review): the factory copies TakenAt and Spid at the moment the setter first runs, and later sets only
// update the value; if TakenAt/Spid are assigned after this property, the split entity keeps the earlier
// key values. Confirm that callers assign the keys first. The same applies to DisagreeCount below.
[NotMapped] public int AgreeCount
{
get => GetSplitEntityValue<SplitAgreeCount, int>(r => r.AgreeCount);
set => SetSplitEntityValue<SplitAgreeCount, int>(value, (r, v) => r.AgreeCount = v,
() => new() {TakenAt = TakenAt, Spid = Spid, AgreeCount = value});
}
// Reads from / writes to the SplitDisagreeCount entity held by the base class.
[NotMapped] public int DisagreeCount
{
get => GetSplitEntityValue<SplitDisagreeCount, int>(r => r.DisagreeCount);
set => SetSplitEntityValue<SplitDisagreeCount, int>(value, (r, v) => r.DisagreeCount = v,
() => new() {TakenAt = TakenAt, Spid = Spid, DisagreeCount = value});
}

/// <summary>Split entity carrying only the AgreeCount value next to the inherited keys.</summary>
public class SplitAgreeCount : BaseSubReplyRevision
{
public int AgreeCount { get; set; }
}
/// <summary>Split entity carrying only the DisagreeCount value next to the inherited keys.</summary>
public class SplitDisagreeCount : BaseSubReplyRevision
{
public int DisagreeCount { get; set; }
}
}
}
19 changes: 16 additions & 3 deletions crawler/src/Db/Revision/ThreadRevision.cs
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
// ReSharper disable PropertyCanBeMadeInitOnly.Global
namespace tbm.Crawler.Db.Revision
{
// NOTE(review): this listing is a rendered diff without +/- markers, so removed and added lines are interleaved.
// The `: BaseRevision` declaration and the plain auto-properties that duplicate a [NotMapped] property or a
// member of BaseThreadRevision appear to be the removed lines; confirm against the repository.
public class ThreadRevision : BaseRevision
/// <summary>
/// Revision of a thread. ViewCount is not mapped on this entity; it is kept in the nested SplitViewCount entity,
/// which TbmDbContext.OnModelCreating() maps to a separate table keyed by (Tid, TakenAt).
/// </summary>
public class ThreadRevision : ThreadRevision.BaseThreadRevision
{
public ulong Tid { get; set; }
/// <summary>Common base of the thread revision and its split entity; adds the Tid key part to TakenAt.</summary>
public abstract class BaseThreadRevision : RevisionWithSplitting<BaseThreadRevision>
{
public ulong Tid { get; set; }
}
public ulong? ThreadType { get; set; }
public string? StickyType { get; set; }
public string? TopicType { get; set; }
public ushort? IsGood { get; set; }
public uint? LatestReplyPostedAt { get; set; }
public long? LatestReplierUid { get; set; }
public uint? ReplyCount { get; set; }
public uint ViewCount { get; set; }
// Reads from / writes to the SplitViewCount entity held by the base class.
// The getter yields default(uint) while no SplitViewCount entity has been created.
// NOTE(review): the factory copies TakenAt and Tid at the moment the setter first runs, and later sets only
// update the value; if TakenAt/Tid are assigned after this property, the split entity keeps the earlier
// key values. Confirm that callers assign the keys first.
[NotMapped] public uint ViewCount
{
get => GetSplitEntityValue<SplitViewCount, uint>(r => r.ViewCount);
set => SetSplitEntityValue<SplitViewCount, uint>(value, (r, v) => r.ViewCount = v,
() => new() {TakenAt = TakenAt, Tid = Tid, ViewCount = value});
}
public uint? ShareCount { get; set; }
public int? AgreeCount { get; set; }
public int? DisagreeCount { get; set; }
public byte[]? Geolocation { get; set; }

/// <summary>Split entity carrying only the ViewCount value next to the inherited keys.</summary>
public class SplitViewCount : BaseThreadRevision
{
public uint ViewCount { get; set; }
}
}
}
43 changes: 37 additions & 6 deletions crawler/src/Db/Revision/UserRevision.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,47 @@
// ReSharper disable PropertyCanBeMadeInitOnly.Global
namespace tbm.Crawler.Db.Revision
{
// NOTE(review): this listing is a rendered diff without +/- markers, so removed and added lines are interleaved.
// The `: BaseRevision` declaration and the plain auto-properties that duplicate a [NotMapped] property or a
// member of BaseUserRevision appear to be the removed lines; confirm against the repository.
public class UserRevision : BaseRevision
/// <summary>
/// Revision of a user. DisplayName, PortraitUpdatedAt and IpGeolocation are not mapped on this entity; each is
/// kept in its own nested split entity, which TbmDbContext.OnModelCreating() maps to a separate table keyed by
/// (Uid, TakenAt).
/// </summary>
public class UserRevision : UserRevision.BaseUserRevision
{
public long Uid { get; set; }
public string TriggeredBy { get; set; } = "";
/// <summary>
/// Common base of the user revision and its split entities; adds Uid and TriggeredBy (empty string by default).
/// </summary>
public abstract class BaseUserRevision : RevisionWithSplitting<BaseUserRevision>
{
public long Uid { get; set; }
public string TriggeredBy { get; set; } = "";
}
public string? Name { get; set; }
public string? DisplayName { get; set; }
// Reads from / writes to the SplitDisplayName entity held by the base class.
// The getter yields null while no SplitDisplayName entity has been created.
// NOTE(review): the factory copies TakenAt, Uid and TriggeredBy at the moment the setter first runs, and later
// sets only update the value; if those members are assigned after this property, the split entity keeps the
// earlier values. Confirm that callers assign them first. The same applies to the other split properties below.
[NotMapped] public string? DisplayName
{
get => GetSplitEntityValue<SplitDisplayName, string?>(r => r.DisplayName);
set => SetSplitEntityValue<SplitDisplayName, string?>(value, (r, v) => r.DisplayName = v,
() => new() {TakenAt = TakenAt, Uid = Uid, TriggeredBy = TriggeredBy, DisplayName = value});
}
public string? Portrait { get; set; }
public uint? PortraitUpdatedAt { get; set; }
// Reads from / writes to the SplitPortraitUpdatedAt entity held by the base class.
[NotMapped] public uint? PortraitUpdatedAt
{
get => GetSplitEntityValue<SplitPortraitUpdatedAt, uint?>(r => r.PortraitUpdatedAt);
set => SetSplitEntityValue<SplitPortraitUpdatedAt, uint?>(value, (r, v) => r.PortraitUpdatedAt = v,
() => new() {TakenAt = TakenAt, Uid = Uid, TriggeredBy = TriggeredBy, PortraitUpdatedAt = value});
}
public ushort? Gender { get; set; }
public byte[]? Icon { get; set; }
public string? IpGeolocation { get; set; }
// Reads from / writes to the SplitIpGeolocation entity held by the base class.
[NotMapped] public string? IpGeolocation
{
get => GetSplitEntityValue<SplitIpGeolocation, string?>(r => r.IpGeolocation);
set => SetSplitEntityValue<SplitIpGeolocation, string?>(value, (r, v) => r.IpGeolocation = v,
() => new() {TakenAt = TakenAt, Uid = Uid, TriggeredBy = TriggeredBy, IpGeolocation = value});
}

/// <summary>Split entity carrying only the DisplayName value next to the inherited members.</summary>
public class SplitDisplayName : BaseUserRevision
{
public string? DisplayName { get; set; }
}
/// <summary>Split entity carrying only the PortraitUpdatedAt value next to the inherited members.</summary>
public class SplitPortraitUpdatedAt : BaseUserRevision
{
public uint? PortraitUpdatedAt { get; set; }
}
/// <summary>Split entity carrying only the IpGeolocation value next to the inherited members.</summary>
public class SplitIpGeolocation : BaseUserRevision
{
public string? IpGeolocation { get; set; }
}
}
}
42 changes: 13 additions & 29 deletions crawler/src/Db/TbmDbContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,35 +40,19 @@ protected override void OnModelCreating(ModelBuilder b)
b.Entity<ReplyContent>().ToTable($"tbmc_f{Fid}_reply_content");
b.Entity<SubReplyPost>().ToTable($"tbmc_f{Fid}_subReply");
b.Entity<SubReplyContent>().ToTable($"tbmc_f{Fid}_subReply_content");
b.Entity<ThreadRevision>()
.SplitToTable("tbmc_revision_thread_viewCount", tb => tb.Property(e => e.ViewCount))
.ToTable("tbmc_revision_thread").HasKey(e => new {e.Tid, e.TakenAt});
b.Entity<ReplyRevision>()
.SplitToTable("tbmc_revision_reply_agreeCount", tb => tb.Property(e => e.AgreeCount))
.SplitToTable("tbmc_revision_reply_subReplyCount", tb => tb.Property(e => e.SubReplyCount))
.SplitToTable("tbmc_revision_reply_floor", tb => tb.Property(e => e.Floor))
.ToTable("tbmc_revision_reply").HasKey(e => new {e.Pid, e.TakenAt});
b.Entity<SubReplyRevision>()
.SplitToTable("tbmc_revision_subReply_agreeCount", tb => tb.Property(e => e.AgreeCount))
.SplitToTable("tbmc_revision_subReply_disagreeCount", tb => tb.Property(e => e.DisagreeCount))
.ToTable("tbmc_revision_subReply").HasKey(e => new {e.Spid, e.TakenAt});
b.Entity<UserRevision>()
.SplitToTable("tbmc_revision_user_ipGeolocation", tb =>
{
tb.Property(e => e.TriggeredBy);
tb.Property(e => e.IpGeolocation);
})
.SplitToTable("tbmc_revision_user_portraitUpdatedAt", tb =>
{
tb.Property(e => e.TriggeredBy);
tb.Property(e => e.PortraitUpdatedAt);
})
.SplitToTable("tbmc_revision_user_displayName", tb =>
{
tb.Property(e => e.TriggeredBy);
tb.Property(e => e.DisplayName);
})
.ToTable("tbmc_revision_user").HasKey(e => new {e.Uid, e.TakenAt});
b.Entity<ThreadRevision>().ToTable("tbmc_revision_thread").HasKey(e => new {e.Tid, e.TakenAt});
b.Entity<ThreadRevision.SplitViewCount>().ToTable("tbmc_revision_thread_viewCount").HasKey(e => new {e.Tid, e.TakenAt});
b.Entity<ReplyRevision>().ToTable("tbmc_revision_reply").HasKey(e => new {e.Pid, e.TakenAt});
b.Entity<ReplyRevision.SplitAgreeCount>().ToTable("tbmc_revision_reply_agreeCount").HasKey(e => new {e.Pid, e.TakenAt});
b.Entity<ReplyRevision.SplitSubReplyCount>().ToTable("tbmc_revision_reply_subReplyCount").HasKey(e => new {e.Pid, e.TakenAt});
b.Entity<ReplyRevision.SplitFloor>().ToTable("tbmc_revision_reply_floor").HasKey(e => new {e.Pid, e.TakenAt});
b.Entity<SubReplyRevision>().ToTable("tbmc_revision_subReply").HasKey(e => new {e.Spid, e.TakenAt});
b.Entity<SubReplyRevision.SplitAgreeCount>().ToTable("tbmc_revision_subReply_agreeCount").HasKey(e => new {e.Spid, e.TakenAt});
b.Entity<SubReplyRevision.SplitDisagreeCount>().ToTable("tbmc_revision_subReply_disagreeCount").HasKey(e => new {e.Spid, e.TakenAt});
b.Entity<UserRevision>().ToTable("tbmc_revision_user").HasKey(e => new {e.Uid, e.TakenAt});
b.Entity<UserRevision.SplitIpGeolocation>().ToTable("tbmc_revision_user_ipGeolocation").HasKey(e => new {e.Uid, e.TakenAt});
b.Entity<UserRevision.SplitPortraitUpdatedAt>().ToTable("tbmc_revision_user_portraitUpdatedAt").HasKey(e => new {e.Uid, e.TakenAt});
b.Entity<UserRevision.SplitDisplayName>().ToTable("tbmc_revision_user_displayName").HasKey(e => new {e.Uid, e.TakenAt});
b.Entity<AuthorExpGradeRevision>().ToTable("tbmc_revision_authorExpGrade").HasKey(e => new {e.Fid, e.Uid, e.DiscoveredAt});
b.Entity<ForumModeratorRevision>().ToTable("tbmc_revision_forumModerator");
b.Entity<Forum>().ToTable("tbm_forum");
Expand Down
2 changes: 1 addition & 1 deletion crawler/src/Tieba/Crawl/Saver/BaseSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ protected SaverChangeSet<TPost> SavePosts<TRevision>(TbmDbContext db,
ExpressionStarter<TPost> existingPostPredicate,
Func<IEnumerable<TRevision>, Expression<Func<TRevision, bool>>> existingRevisionPredicate,
Expression<Func<TRevision, TRevision>> revisionKeySelector)
where TRevision : BaseRevision, new()
where TRevision : class, IRevision, new()
{
var dbSet = db.Set<TPost>().TagWith("ForUpdate");
if (dbSet == null) throw new ArgumentException(
Expand Down
9 changes: 7 additions & 2 deletions crawler/src/Tieba/Crawl/Saver/CommonInSavers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ protected void SavePostsOrUsers<TPostOrUser, TRevision>(
Func<TRevision, long> revisionPostOrUserIdSelector,
Func<IEnumerable<TRevision>, Expression<Func<TRevision, bool>>> existingRevisionPredicate,
Expression<Func<TRevision, TRevision>> revisionKeySelector)
where TPostOrUser : class where TRevision : BaseRevision, new()
where TPostOrUser : class where TRevision : class, IRevision, new()
{
db.Set<TPostOrUser>().AddRange(existingOrNewLookup[false]); // newly added
db.TimestampingEntities();
var newRevisions = existingOrNewLookup[true].Select(newPostOrUser =>
{
var postOrUserInTracking = existingSelector(newPostOrUser);
Expand Down Expand Up @@ -101,7 +102,11 @@ or nameof(ITimestampingEntity.CreatedAt)
if (revision != null) revision.NullFieldsBitMask = (ushort?)revisionNullFieldsBitMask;
return revision;
}).OfType<TRevision>().ToList();
db.TimestampingEntities();

// Track the split entities of every new revision. GetSplitEntities() returns a collection per revision,
// and DbContext.AddRange() treats each element of the sequence it receives as one entity, so the
// per-revision collections must be flattened with SelectMany(); Select() would pass the collections
// themselves as if they were entities.
db.AddRange(newRevisions.OfType<ThreadRevision>().SelectMany(r => r.GetSplitEntities()));
db.AddRange(newRevisions.OfType<ReplyRevision>().SelectMany(r => r.GetSplitEntities()));
db.AddRange(newRevisions.OfType<SubReplyRevision>().SelectMany(r => r.GetSplitEntities()));
db.AddRange(newRevisions.OfType<UserRevision>().SelectMany(r => r.GetSplitEntities()));

if (!newRevisions.Any()) return; // quick exit to prevent execute sql with WHERE FALSE clause
var existingRevisions = db.Set<TRevision>()
Expand Down

0 comments on commit a75c8ee

Please sign in to comment.