Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove extra body tags from processed contents #165

Merged
merged 1 commit into from
Feb 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Behaviour/HtmlCrawlerManagerInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ interface HtmlCrawlerManagerInterface
/**
* Get HTML Crawler for the given property (creates it if needed)
*/
public function get(Content $content, array &$data, string $property): ?Crawler;
public function get(Content $content, array $data, string $property): ?Crawler;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Micro BC break, I guess, since `$data` is no longer passed by reference to `get()`, but it seems fair.


/**
* Dump the current state of the HTML Crawler into data for the given property.
Expand Down
6 changes: 3 additions & 3 deletions src/Processor/HtmlIdProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,21 @@ public function __invoke(array &$data, Content $content): void

private function setIdFromContent(\DOMElement $element): void
{
if (!$id = $element->getAttribute('id')) {
if (!$element->getAttribute('id')) {
$element->setAttribute('id', $this->slugify($element->textContent));
}
}

private function setIdFromHashedContent(\DOMElement $element): void
{
if (!$id = $element->getAttribute('id')) {
if (!$element->getAttribute('id')) {
$element->setAttribute('id', $this->hash($element->textContent));
}
}

private function setIdForImage(\DOMElement $element): void
{
if (!$id = $element->getAttribute('id')) {
if (!$element->getAttribute('id')) {
$name = $element->getAttribute('alt') ?: basename($element->getAttribute('src'));
$element->setAttribute('id', $this->slugify($name));
}
Expand Down
8 changes: 4 additions & 4 deletions src/Service/NaiveHtmlCrawlerManager.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class NaiveHtmlCrawlerManager implements HtmlCrawlerManagerInterface
*/
private array $crawlers = [];

public function get(Content $content, array &$data, string $property): ?Crawler
public function get(Content $content, array $data, string $property): ?Crawler
{
$key = "{$content->getType()}:{$content->getSlug()}";
$crawler = $this->createCrawler($data[$property]);
Expand All @@ -40,16 +40,16 @@ public function save(Content $content, array &$data, string $property): void
$key = "{$content->getType()}:{$content->getSlug()}";

if (isset($this->crawlers[$key][$property])) {
$data[$property] = $this->crawlers[$key][$property]->html();
$data[$property] = $this->crawlers[$key][$property]->filterXPath('//body')->first()->html();
unset($this->crawlers[$key][$property]);
}
}

public function saveAll(Content $content, array &$data): void
{
foreach ($this->crawlers as $key => $crawlers) {
foreach ($this->crawlers as $crawlers) {
foreach ($crawlers as $property => $crawler) {
$data[$property] = $crawler->html();
$data[$property] = $crawler->filterXPath('//body')->first()->html();
}
}

Expand Down
4 changes: 3 additions & 1 deletion src/Service/Parsedown.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ protected function blockAdmonition($line, $block = null)

return $block;
}

return null;
}

protected function blockAdmonitionContinue($line, $block = null)
{
// A blank newline has occurred, or text without indent:
if (isset($block['interrupted']) || $line['indent'] < 4) {
return;
return null;
}

$previous = $block['$admonitionContentRef'] ?? "\n";
Expand Down
4 changes: 2 additions & 2 deletions src/Service/SharedHtmlCrawlerManager.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class SharedHtmlCrawlerManager implements HtmlCrawlerManagerInterface
*/
private array $crawlers = [];

public function get(Content $content, array &$data, string $property): ?Crawler
public function get(Content $content, array $data, string $property): ?Crawler
{
$key = "{$content->getType()}:{$content->getSlug()}";

Expand Down Expand Up @@ -54,7 +54,7 @@ public function saveAll(Content $content, array &$data): void
}

foreach ($this->crawlers[$key] as $property => $crawler) {
$data[$property] = $crawler->html();
$data[$property] = $crawler->filterXPath('//body')->first()->html();
}

unset($this->crawlers[$key]);
Expand Down
6 changes: 5 additions & 1 deletion tests/Unit/Processor/ResolveContentLinksProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ public function testResolveLinks(): void
<a href="/other-contents-route-path/another-contents#some-anchor">Another content with anchor</a>
</body>
HTML,
$data['content']
<<<HTML
<body>
{$data['content']}
</body>
HTML,
);
}
}
117 changes: 117 additions & 0 deletions tests/Unit/Service/HtmlCrawlerManagerTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
<?php

/*
* This file is part of the "StenopePHP/Stenope" bundle.
*
* @author Thomas Jarrand <[email protected]>
*/

namespace Stenope\Bundle\Tests\Unit\Service;

use PHPUnit\Framework\TestCase;
use Stenope\Bundle\Behaviour\HtmlCrawlerManagerInterface;
use Stenope\Bundle\Content;
use Stenope\Bundle\Service\NaiveHtmlCrawlerManager;
use Stenope\Bundle\Service\SharedHtmlCrawlerManager;

/**
 * Checks that HTML crawler managers write back only the markup found inside
 * the <body> element, whether the source property holds a full HTML document
 * or a bare fragment.
 */
class HtmlCrawlerManagerTest extends TestCase
{
    /**
     * Saving a single property must not leave <html>/<head>/<body> wrappers
     * in the stored value.
     *
     * @dataProvider provideNoExtraBodyData
     */
    public function testSaveNoExtraBody(
        HtmlCrawlerManagerInterface $manager,
        string $html,
        string $expected
    ): void {
        // The shared manager only persists through saveAll(), so a per-property
        // save() has nothing to assert against.
        if ($manager instanceof SharedHtmlCrawlerManager) {
            $this->markTestSkipped('SharedHtmlCrawlerManager does nothing on save()');
        }

        $content = new Content('slug', 'type', $html, 'html');
        $data = ['content' => $html];

        $manager->get($content, $data, 'content');
        $manager->save($content, $data, 'content');

        self::assertSameMarkup($expected, $data['content']);
    }

    /**
     * Saving every tracked property at once must not leave
     * <html>/<head>/<body> wrappers in the stored values.
     *
     * @dataProvider provideNoExtraBodyData
     */
    public function testSaveAllNoExtraBody(
        HtmlCrawlerManagerInterface $manager,
        string $html,
        string $expected
    ): void {
        $content = new Content('slug', 'type', $html, 'html');
        $data = ['content' => $html];

        $manager->get($content, $data, 'content');
        $manager->saveAll($content, $data);

        self::assertSameMarkup($expected, $data['content']);
    }

    /**
     * Provides [manager, input HTML, expected body markup] for both manager
     * implementations, with a full document and with a bare fragment.
     *
     * Declared static because PHPUnit 10+ no longer accepts instance methods
     * as data providers; older PHPUnit versions accept static providers too.
     *
     * @return iterable<string, array{HtmlCrawlerManagerInterface, string, string}>
     */
    public static function provideNoExtraBodyData(): iterable
    {
        $fullHtml = <<<HTML
            <html>
            <head>
            <title>My title</title>
            </head>
            <body>
            <h1>My title</h1>
            <p>My content</p>
            </body>
            </html>
            HTML;

        $expected = <<<HTML
            <h1>My title</h1>
            <p>My content</p>
            HTML;

        yield 'with full HTML and naive manager' => [
            new NaiveHtmlCrawlerManager(),
            $fullHtml,
            $expected,
        ];

        yield 'with full HTML and shared manager' => [
            new SharedHtmlCrawlerManager(),
            $fullHtml,
            $expected,
        ];

        $partialHtml = <<<HTML
            <h1>My title</h1>
            <p>My content</p>
            HTML;

        yield 'with partial HTML and naive manager' => [
            new NaiveHtmlCrawlerManager(),
            $partialHtml,
            $expected,
        ];

        yield 'with partial HTML and shared manager' => [
            new SharedHtmlCrawlerManager(),
            $partialHtml,
            $expected,
        ];
    }

    /**
     * Compares two HTML fragments as XML.
     *
     * Both fragments are wrapped in a single <html> root so that a fragment
     * with several top-level elements still forms a well-formed XML document.
     */
    private static function assertSameMarkup(string $expected, string $actual): void
    {
        self::assertXmlStringEqualsXmlString(
            "<html>{$expected}</html>",
            "<html>{$actual}</html>",
        );
    }
}