Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extends RegexParse plugin #1096

Open
wants to merge 6 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 149 additions & 38 deletions lib/LANraragi/Plugin/Metadata/RegexParse.pm
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,64 @@ use warnings;
#Plugins can freely use all Perl packages already installed on the system
#Try however to restrain yourself to the ones already installed for LRR (see tools/cpanfile) to avoid extra installations by the end-user.
use File::Basename;
use Scalar::Util qw(looks_like_number);

#You can also use the LRR Internal API when fitting.
use LANraragi::Model::Plugins;
use LANraragi::Utils::Database qw(redis_encode redis_decode);
use LANraragi::Utils::Logging qw(get_plugin_logger);
use LANraragi::Utils::String qw(trim);
use LANraragi::Utils::Logging qw(get_plugin_logger);
use LANraragi::Utils::String qw(trim);
use Scalar::Util qw(looks_like_number);

my $PLUGIN_TAG_NS = 'parsed:';

# consider using Locale::Language / Locale::Script
# Lookup table used to recognise language markers found in filenames.
# Keys are lowercased short codes or English language names; values are the
# canonical name that ends up after the "language:" tag prefix.
# Entries marked "# ?" were flagged as uncertain by the original author.
# Each key must appear only once: later duplicates silently override earlier ones.
my %VALID_LANGUAGES = (
    'chi'      => 'chinese',     # ?
    'chinese'  => 'chinese',
    'de'       => 'german',
    'deu'      => 'german',
    'en'       => 'english',
    'eng'      => 'english',
    'english'  => 'english',
    'es'       => 'spanish',
    'fr'       => 'french',
    'fra'      => 'french',
    'fre'      => 'french',
    'french'   => 'french',
    'ger'      => 'german',      # ?
    'german'   => 'german',
    'it'       => 'italian',
    'ita'      => 'italian',
    'italian'  => 'italian',
    'ja'       => 'japanese',
    'japanese' => 'japanese',
    'jpn'      => 'japanese',
    'ko'       => 'korean',
    'kor'      => 'korean',
    'korean'   => 'korean',
    'pl'       => 'polish',
    'pol'      => 'polish',
    'polish'   => 'polish',
    'ru'       => 'russian',
    'rus'      => 'russian',
    'russian'  => 'russian',
    'spa'      => 'spanish',
    'spanish'  => 'spanish',
    'zh'       => 'chinese',
    'zho'      => 'chinese',
    'textless' => 'textless'
);

# Set of lowercased words that are looked up when classifying a captured
# value; every listed word maps to a true value (1) so a plain hash lookup
# works as a membership test.
my %COMMON_EXTRANEOUS_VALUES = map { ( $_ => 1 ) } qw(
    uncensored
    decensored
    ongoing
    pixiv
    twitter
    fanbox
    cosplay
    digital
);

#Meta-information about your plugin.
sub plugin_info {
Expand All @@ -23,34 +74,49 @@ sub plugin_info {
type => "metadata",
namespace => "regexplugin",
author => "Difegue",
version => "1.0",
version => "1.0.1",
description =>
"Derive tags from the filename of the given archive. <br>Follows the doujinshi naming standard (Release) [Artist] TITLE (Series) [Language].",
icon =>
"",
parameters => [ ]
parameters => [
{ type => "bool", desc => "Capture trailing tags in curly brackets" },
IceBreeze marked this conversation as resolved.
Show resolved Hide resolved
{ type => "bool",
desc => "Keep everything you catch as tags in the namespace \"${PLUGIN_TAG_NS}\"<BR />"
IceBreeze marked this conversation as resolved.
Show resolved Hide resolved
. "(this should be used in conjunction with Tag Rules)"
}
],
);

}

#Mandatory function to be implemented by your plugin
sub get_tags {
my ( undef, $lrr_info, $check_trailing_tags, $keep_all_captures ) = @_;

shift;
my $lrr_info = shift; # Global info hash
# lrr_info's file_path is taken straight from the filesystem, which might not be proper UTF-8.
# Run a decode to make sure we can derive tags with the proper encoding.
my $file = Mojo::File->new( redis_decode( $lrr_info->{'file_path'} ) );
my $filename = $file->basename( $file->extname );

my ( $tags, $title ) = parse_filename(
$filename,
{ 'check_trailing_tags' => $check_trailing_tags,
'keep_all_captures' => $keep_all_captures
}
);

my $logger = get_plugin_logger();
my $file = $lrr_info->{file_path};
$logger->info("Sending the following tags to LRR: $tags");
$logger->info("Parsed title is $title");

# lrr_info's file_path is taken straight from the filesystem, which might not be proper UTF-8.
# Run a decode to make sure we can derive tags with the proper encoding.
$file = redis_decode($file);
return ( tags => $tags, title => $title );
}

# Get the filename from the file_path info field
my ( $filename, $filepath, $suffix ) = fileparse( $file, qr/\.[^.]*/ );
sub parse_filename {
my ( $filename, $params ) = @_;

my ( $event, $artist, $title, $series, $language );
$event = $artist = $title = $series = $language = "";
my ( $event, $artist, $title, $series, $language, $trailing_tags, $other_captures );

#Replace underscores with spaces
$filename =~ s/_/ /g;
Expand All @@ -64,42 +130,87 @@ sub get_tags {
if ( defined $5 ) { $title = trim($5); }
if ( defined $7 ) { $series = $7; }
if ( defined $9 ) { $language = $9; }
my $tail = trim( $+{'tail'} );

if ($tail) {

# match trailing_tags (...{Tags}.ext)
if ( $params->{'check_trailing_tags'} ) {
$tail =~ /(?<head>.*)\{(?<ttags>[^\}]*)\}$/;
$trailing_tags = $+{'ttags'};
$tail = $+{'head'};
}

my @tags = ();
# match any remaining parenthesis
if ( $tail && $params->{'keep_all_captures'} ) {
my @items = ( $tail =~ /\(([^\)]+)\)|\{([^}]+)\}|\[([^\]]+)\]/g );
$other_captures = join( ',', grep { trim($_) } @items );
}
}

my @tags;

if ( $event ne "" ) {
push @tags, "event:$event";
push @tags, parse_artist_value($artist) if ($artist);
push @tags, "event:$event" if ($event);
push @tags, parse_language_value($language) if ($language);
push @tags, parse_captured_value_for_namespace( $series, 'series:' ) if ($series);
push @tags, parse_captured_value_for_namespace( $other_captures, $PLUGIN_TAG_NS ) if ($other_captures);
push @tags, parse_captured_value_for_namespace( $trailing_tags, '' ) if ($trailing_tags);

if ( !$params->{'keep_all_captures'} ) {
@tags = grep { !m/^\Q$PLUGIN_TAG_NS/ } @tags;
}

if ( $artist ne "" ) {
return ( join( ", ", sort @tags ), trim($title) );
}

#Special case for circle/artist sets:
#If the string contains parenthesis, what's inside those is the artist name
#the rest is the circle.
if ( $artist =~ /(.*) \((.*)\)/ ) {
push @tags, "group:$1";
push @tags, "artist:$2";
sub parse_language_value {
my ($language) = @_;
my @tags;
my @maybe_languages = map { trim( lc $_ ) } split( m/,/, $language );
IceBreeze marked this conversation as resolved.
Show resolved Hide resolved
foreach my $item (@maybe_languages) {
next if ( !$item );
my $lang = $VALID_LANGUAGES{$item};
if ($lang) {
push @tags, "language:$lang";
} else {
push @tags, "artist:$artist";
push @tags, "${PLUGIN_TAG_NS}${item}";
}
}
return @tags;
}

if ( $series ne "" ) {
push @tags, "series:$series";
}
sub parse_artist_value {
my ($artist) = @_;

my @tags;

# Don't push numbers as tags for language.
unless ( $language eq "" || looks_like_number($language) ) {
push @tags, "language:$language";
#Special case for circle/artist sets:
#If the string contains parenthesis, what's inside those is the artist name
#the rest is the circle.
if ( $artist =~ /(.*) \((.*)\)/ ) {
push @tags, "group:$1"; # split group?
$artist = $2;
}
push @tags, parse_captured_value_for_namespace( $artist, 'artist:' );

my $tagstring = join( ", ", @tags );
return @tags;
}

$logger->info("Sending the following tags to LRR: $tagstring");
# Split a comma separated capture into individual tags.
#
# Parameters:
#   $capture   - raw captured text, e.g. "foo, bar"
#   $namespace - prefix handed to _classify_item for every item (may be '')
#
# Returns the list of strings produced by _classify_item, one per non-empty
# item. Items that are empty after trimming (leading comma, ", ," sequences)
# are dropped so that no bare "namespace:" tag is emitted.
sub parse_captured_value_for_namespace {
    my ( $capture, $namespace ) = @_;

    # Nothing captured: no tags.
    return if !defined $capture;

    # Use a length check rather than truthiness so that an item of "0" survives.
    my @items = grep { defined($_) && length($_) } map { trim($_) } split( m/,/, $capture );

    return map { _classify_item( $_, $namespace ) } @items;
}

$logger->info("Parsed title is $title");
return ( tags => $tagstring, title => $title );
# Decide which tag string a single captured item becomes.
#
# Parameters:
#   $item      - one captured value
#   $namespace - tag prefix to prepend; may be '' for plain tags
#
# Returns $PLUGIN_TAG_NS followed by the lowercased item when the item is
# numeric, or when a namespace is given and the lowercased item is a key of
# %COMMON_EXTRANEOUS_VALUES. Otherwise returns the item prefixed by $namespace.
sub _classify_item {
    my ( $item, $namespace ) = @_;

    # The word list is only consulted when a namespace was supplied ...
    my $is_extraneous = $namespace && $COMMON_EXTRANEOUS_VALUES{ lc $item };

    # ... while the numeric check applies regardless of the namespace.
    my $is_numeric = looks_like_number($item);

    return "${namespace}${item}" unless $is_extraneous || $is_numeric;

    return $PLUGIN_TAG_NS . lc $item;
}

#Regular Expression matching the E-Hentai standard: (Release) [Artist] TITLE (Series) [Language]
Expand All @@ -115,7 +226,7 @@ sub get_tags {
#(\(([^([)]+)\))? returns the content of (Series). Optional.
#(\[([^]]+)\])? returns the content of [Language]. Optional.
#\s* indicates zero or more whitespaces.
my $regex = qr/(\(([^([]+)\))?\s*(\[([^]]+)\])?\s*([^([]+)\s*(\(([^([)]+)\))?\s*(\[([^]]+)\])?/;
my $regex = qr/(\(([^([]+)\))?\s*(\[([^]]+)\])?\s*([^([]+)\s*(\(([^([)]+)\))?\s*(\[([^]]+)\])?(?<tail>.*)?/;
IceBreeze marked this conversation as resolved.
Show resolved Hide resolved
sub get_regex { return $regex }

1;
Loading