diff --git a/PATCHES/remove-tcpdf-footer.patch b/PATCHES/remove-tcpdf-footer.patch new file mode 100644 index 000000000..707544eff --- /dev/null +++ b/PATCHES/remove-tcpdf-footer.patch @@ -0,0 +1,39 @@ +diff --git a/src/MyPdf.php b/src/MyPdf.php +index 082131a..bb3cb96 100644 +--- a/src/MyPdf.php ++++ b/src/MyPdf.php +@@ -21,6 +21,7 @@ class MyPdf extends TCPDF + protected $_transf = array(); + protected $_myLastPageGroup = null; + protected $_myLastPageGroupNb = 0; ++ protected $tcpdflink = false; + + // used to make a radius with bezier : (4/3 * (sqrt(2) - 1)) + const MY_ARC = 0.5522847498; +@@ -267,7 +268,7 @@ class MyPdf extends TCPDF + $cornerBL = null, + $cornerBR = null + ) { +- ++ + // init the path + $path = ''; + +@@ -1087,7 +1088,7 @@ class MyPdf extends TCPDF + $drawFirst = true, + $trans = false + ) { +- ++ + // if we want the no trigo direction : add 2PI to the begin angle, to invert the direction + if (!$direction) { + $angleBegin+= M_PI*2.; +@@ -1387,7 +1388,7 @@ class MyPdf extends TCPDF + $page = null, + $fontName = 'helvetica' + ) { +- ++ + // bookmark the Title if wanted + if ($bookmarkTitle) { + $this->Bookmark($titre, 0, -1); diff --git a/README.md b/README.md index e0fdf4cfc..ee628d0fb 100644 --- a/README.md +++ b/README.md @@ -96,3 +96,25 @@ XDEBUG_MODE=coverage vendor/bin/phpunit # without coverage vendor/bin/phpunit ``` + +## For dev + +```bash +drush rapi-i --alias job --verbose +drush rw-job:index --verbose + +cget ocha_ai.settings --include-overridden +cget ocha_ai_tag.settings --include-overridden +cget ocha_ai_chat.settings --include-overridden +cget reliefweb_api.settings --include-overridden + +cset ocha_ai.settings plugins.source.reliefweb.api_url https://dev.api-reliefweb-int.ahconu.org/v1 +cset ocha_ai.settings plugins.source.reliefweb.converter_url https://xxx:xxx@dev.reliefweb-int.ahconu.org/search/converter/json +cset reliefweb_api.settings api_url https://dev.api-reliefweb-int.ahconu.org/v1 +cset reliefweb_api.settings 
api_url_external https://dev.api-reliefweb-int.ahconu.org/v1 +cset reliefweb_api.settings website: https://dev.reliefweb-int.ahconu.org + +queue:list +queue:run --verbose reliefweb_job_tagger + +``` diff --git a/composer.json b/composer.json index 4db66b68e..288dc9817 100644 --- a/composer.json +++ b/composer.json @@ -78,6 +78,7 @@ "reliefweb/api-indexer": "^v2.8", "reliefweb/simple-autocomplete": "^v1.3", "reliefweb/simple-datepicker": "^v1.3", + "spipu/html2pdf": "^5.2", "symfony/uid": "^6.2", "unocha/common_design": "^9.4", "unocha/gtm_barebones": "^1.1", diff --git a/composer.lock b/composer.lock index c31076985..c0d8ff9fc 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "fa83edf0d251608ba959078e1ce18d02", + "content-hash": "ce442efcb68ed1c8c63a469cd7f7566a", "packages": [ { "name": "asm89/stack-cors", @@ -3761,7 +3761,7 @@ "homepage": "https://www.drupal.org/user/213194" }, { - "name": "TR", + "name": "tr", "homepage": "https://www.drupal.org/user/202830" }, { @@ -4207,7 +4207,7 @@ ], "authors": [ { - "name": "Cellar Door", + "name": "cellar door", "homepage": "https://www.drupal.org/user/658076" }, { @@ -13400,6 +13400,63 @@ ], "time": "2024-03-09T15:20:58+00:00" }, + { + "name": "spipu/html2pdf", + "version": "v5.2.8", + "source": { + "type": "git", + "url": "https://github.com/spipu/html2pdf.git", + "reference": "6c94dcd48c94c6c73f206629839c1ebd81e8c726" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spipu/html2pdf/zipball/6c94dcd48c94c6c73f206629839c1ebd81e8c726", + "reference": "6c94dcd48c94c6c73f206629839c1ebd81e8c726", + "shasum": "" + }, + "require": { + "ext-gd": "*", + "ext-mbstring": "*", + "php": "^5.6 || ^7.0 || ^8.0", + "tecnickcom/tcpdf": "^6.3" + }, + "require-dev": { + "phpunit/phpunit": "^5.0 || ^9.0" + }, + "suggest": { + "ext-gd": "Allows to embed 
images into the PDF", + "fagundes/zff-html2pdf": "if you need to integrate Html2Pdf with Zend Framework 2 (zf2)" + }, + "type": "library", + "autoload": { + "psr-4": { + "Spipu\\Html2Pdf\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "OSL-3.0" + ], + "authors": [ + { + "name": "Spipu", + "homepage": "https://github.com/spipu", + "role": "Developer" + } + ], + "description": "Html2Pdf is a HTML to PDF converter written in PHP5 (it uses TCPDF). OFFICIAL PACKAGE", + "homepage": "http://html2pdf.fr/", + "keywords": [ + "html", + "html2pdf", + "pdf" + ], + "support": { + "issues": "https://github.com/spipu/html2pdf/issues", + "source": "https://github.com/spipu/html2pdf/tree/v5.2.8" + }, + "time": "2023-07-18T14:52:59+00:00" + }, { "name": "squizlabs/php_codesniffer", "version": "3.10.2", @@ -16788,6 +16845,78 @@ ], "time": "2024-08-12T09:55:28+00:00" }, + { + "name": "tecnickcom/tcpdf", + "version": "6.7.5", + "source": { + "type": "git", + "url": "https://github.com/tecnickcom/TCPDF.git", + "reference": "951eabf0338ec2522bd0d5d9c79b08a3a3d36b36" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/tecnickcom/TCPDF/zipball/951eabf0338ec2522bd0d5d9c79b08a3a3d36b36", + "reference": "951eabf0338ec2522bd0d5d9c79b08a3a3d36b36", + "shasum": "" + }, + "require": { + "php": ">=5.5.0" + }, + "type": "library", + "autoload": { + "classmap": [ + "config", + "include", + "tcpdf.php", + "tcpdf_parser.php", + "tcpdf_import.php", + "tcpdf_barcodes_1d.php", + "tcpdf_barcodes_2d.php", + "include/tcpdf_colors.php", + "include/tcpdf_filters.php", + "include/tcpdf_font_data.php", + "include/tcpdf_fonts.php", + "include/tcpdf_images.php", + "include/tcpdf_static.php", + "include/barcodes/datamatrix.php", + "include/barcodes/pdf417.php", + "include/barcodes/qrcode.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0-or-later" + ], + "authors": [ + { + "name": "Nicola Asuni", + 
"email": "info@tecnick.com", + "role": "lead" + } + ], + "description": "TCPDF is a PHP class for generating PDF documents and barcodes.", + "homepage": "http://www.tcpdf.org/", + "keywords": [ + "PDFD32000-2008", + "TCPDF", + "barcodes", + "datamatrix", + "pdf", + "pdf417", + "qrcode" + ], + "support": { + "issues": "https://github.com/tecnickcom/TCPDF/issues", + "source": "https://github.com/tecnickcom/TCPDF/tree/6.7.5" + }, + "funding": [ + { + "url": "https://www.paypal.com/cgi-bin/webscr?cmd=_donations¤cy_code=GBP&business=paypal@tecnick.com&item_name=donation%20for%20tcpdf%20project", + "type": "custom" + } + ], + "time": "2024-04-20T17:25:10+00:00" + }, { "name": "theseer/tokenizer", "version": "1.2.3", @@ -18032,5 +18161,5 @@ "php": ">=8.2" }, "platform-dev": [], - "plugin-api-version": "2.6.0" + "plugin-api-version": "2.3.0" } diff --git a/composer.patches.json b/composer.patches.json index 57f3ab49c..58ff98e70 100644 --- a/composer.patches.json +++ b/composer.patches.json @@ -36,6 +36,9 @@ "drush/drush": { "https://humanitarian.atlassian.net/browse/OPS-8026": "PATCHES/drush--timeout-override.patch" }, + "spipu/html2pdf": { + "Remove promo link": "PATCHES/remove-tcpdf-footer.patch" + }, "unocha/common_design": { "https://humanitarian.atlassian.net/browse/CD-519": "PATCHES/common_design--default-logo-remove.patch" } diff --git a/html/modules/custom/reliefweb_semantic/README.md b/html/modules/custom/reliefweb_semantic/README.md new file mode 100644 index 000000000..ab4dc9166 --- /dev/null +++ b/html/modules/custom/reliefweb_semantic/README.md @@ -0,0 +1,76 @@ +# ReliefWeb - Semantic module + +This module provides integration with the ReliefWeb Semantic API. 
+ +## To do + +- [x] Add service to query API + +## AWS + +Dashboards: + +- https://us-east-1.console.aws.amazon.com/s3/lens/dashboard/RW-KB?region=us-east-1&bucketType=general + +Config: + +- Role: `BedrockRoleKbRw` +- Collection: `arn:aws:aoss:us-east-1:694216630861:collection/b2h8ajgjb3x87ur892hc` +- Vector field: `embedding` +- Text field name: `AMAZON_BEDROCK_TEXT_CHUNK` +- Metadata field name: `AMAZON_BEDROCK_METADATA` + +All content is using a single KB, current Id is `VIEPSPYNSS` + +## Drush + +```bash +drush reliefweb-semantic:index Index content in the ReliefWeb API. +drush reliefweb-semantic:list-kbs List kbs. +drush reliefweb-semantic:list-datasources List datasources. +drush reliefweb-semantic:list-jobs List ingestion jobs. +drush reliefweb-semantic:trigger-sync Trigger sync. +drush reliefweb-semantic:query-kb --id=WYBGQOFQLN --q="Any jobs in Europe" +drush reliefweb-semantic:list-apikeys +``` + +## OpenSearch + +```json +GET /bedrock-knowledge-base-default-index/_search +{ + "query": { + "match_all": {} + } +} +``` + +```json +GET /bedrock-knowledge-base-default-index/_search +{ + "query": { + "match": { + "country": "*" + } + + } +} +``` + +```json +GET /bedrock-knowledge-base-default-index/_search +{ + "query": { + "match": { + "title": { + "query": "china", + "fuzziness": "AUTO" + } + } + } +} +``` + +## Questions + +- Describe the weather in china in june 2024 diff --git a/html/modules/custom/reliefweb_semantic/drush.services.yml b/html/modules/custom/reliefweb_semantic/drush.services.yml new file mode 100644 index 000000000..50273e477 --- /dev/null +++ b/html/modules/custom/reliefweb_semantic/drush.services.yml @@ -0,0 +1,12 @@ +services: + reliefweb_semantic.commands: + class: \Drupal\reliefweb_semantic\Commands\ReliefWebSemanticCommands + arguments: ['@config.factory', '@entity_field.manager', '@entity_type.manager', '@module_handler', '@state', '@http_client', '@file_system', '@renderer'] + tags: + - { name: drush.command } + 
reliefweb_semantic.aws_commands: + class: \Drupal\reliefweb_semantic\Commands\ReliefWebSemanticAwsCommands + arguments: ['@config.factory', '@state', '@reliefweb_semantic.search_service'] + tags: + - { name: drush.command } + diff --git a/html/modules/custom/reliefweb_semantic/reliefweb_semantic.info.yml b/html/modules/custom/reliefweb_semantic/reliefweb_semantic.info.yml new file mode 100644 index 000000000..4e7d3e7e5 --- /dev/null +++ b/html/modules/custom/reliefweb_semantic/reliefweb_semantic.info.yml @@ -0,0 +1,5 @@ +type: module +name: ReliefWeb Semantic +description: 'Provides integration with the ReliefWeb Semantic API.' +package: reliefweb +core_version_requirement: ^9 || ^10 diff --git a/html/modules/custom/reliefweb_semantic/reliefweb_semantic.module b/html/modules/custom/reliefweb_semantic/reliefweb_semantic.module new file mode 100644 index 000000000..11946462c --- /dev/null +++ b/html/modules/custom/reliefweb_semantic/reliefweb_semantic.module @@ -0,0 +1,52 @@ +get('aws_bedrock_region'); + $role_arn = $config->get('aws_bedrole_role_arn', NULL); + + if (!empty($role_arn)) { + $stsClient = new StsClient([ + 'region' => $region, + 'version' => 'latest', + ]); + + $result = $stsClient->AssumeRole([ + 'RoleArn' => $role_arn, + 'RoleSessionName' => 'aws-bedrock-ocha-ai-summarize', + ]); + + $credentials = [ + 'key' => $result['Credentials']['AccessKeyId'], + 'secret' => $result['Credentials']['SecretAccessKey'], + 'token' => $result['Credentials']['SessionToken'], + ]; + } + else { + $credentials = [ + 'key' => $config->get('bedrock_access_key'), + 'secret' => $config->get('bedrock_secret_key'), + ]; + } + + return [ + 'credentials' => $credentials, + 'region' => $region, + ]; +} diff --git a/html/modules/custom/reliefweb_semantic/reliefweb_semantic.services.yml b/html/modules/custom/reliefweb_semantic/reliefweb_semantic.services.yml new file mode 100644 index 000000000..5cb4f87c4 --- /dev/null +++ 
b/html/modules/custom/reliefweb_semantic/reliefweb_semantic.services.yml @@ -0,0 +1,4 @@ +services: + reliefweb_semantic.search_service: + class: Drupal\reliefweb_semantic\Services\ReliefWebSemanticService + arguments: [] diff --git a/html/modules/custom/reliefweb_semantic/src/Commands/ReliefWebSemanticAwsCommands.php b/html/modules/custom/reliefweb_semantic/src/Commands/ReliefWebSemanticAwsCommands.php new file mode 100644 index 000000000..d524e15ad --- /dev/null +++ b/html/modules/custom/reliefweb_semantic/src/Commands/ReliefWebSemanticAwsCommands.php @@ -0,0 +1,362 @@ +config = $config_factory->get('reliefweb_semantic.settings'); + $this->state = $state; + $this->rwService = $rw_service; + } + + /** + * List kbs. + * + * @param array $options + * Additional options for the command. + * + * @command reliefweb-semantic:list-kbs + * + * @option reset. + * + * @default $options [] + * + * @usage reliefweb-semantic:list-kbs + * List kbs. + */ + public function listKbs( + array $options = [ + 'reset' => 0, + 'format' => 'table', + ], + ) : RowsOfFields { + $data = $this->getKbs($options['reset']); + return new RowsOfFields($data); + } + + /** + * List datasources. + * + * @param array $options + * Additional options for the command. + * + * @command reliefweb-semantic:list-datasources + * + * @option reset. + * + * @default $options [] + * + * @usage reliefweb-semantic:list-datasources + * List datasources. + */ + public function listDatasources( + array $options = [ + 'reset' => 0, + 'format' => 'table', + ], + ) : RowsOfFields { + $data = $this->getDatasources($options['reset']); + return new RowsOfFields($data); + } + + /** + * List ingestion jobs. + * + * @param array $options + * Additional options for the command. + * + * @command reliefweb-semantic:list-jobs + * + * @option reset. + * + * @default $options [] + * + * @usage reliefweb-semantic:list-jobs + * List jobs. 
+ */ + public function listJobs( + array $options = [ + 'id' => 0, + 'format' => 'table', + ], + ) : null|RowsOfFields { + $aws_options = reliefweb_semantic_get_aws_client_options(); + $bedrock = new BedrockAgentClient($aws_options); + + if (empty($options['id'])) { + return NULL; + } + + $datasources = $this->getDatasources(); + $result = $bedrock->listIngestionJobs([ + 'dataSourceId' => $datasources[$options['id']]['id'], + 'knowledgeBaseId' => $datasources[$options['id']]['kb_id'], + ]); + + $jobs = $result->toArray()['ingestionJobSummaries']; + $data = []; + + foreach ($jobs as $job) { + $data[$job['ingestionJobId']] = [ + 'id' => $job['ingestionJobId'], + 'status' => $job['status'], + 'updated' => $job['updatedAt'], + 'numberOfDocumentsScanned' => $job['statistics']['numberOfDocumentsScanned'], + 'numberOfDocumentsFailed' => $job['statistics']['numberOfDocumentsFailed'], + 'numberOfNewDocumentsIndexed' => $job['statistics']['numberOfNewDocumentsIndexed'], + ]; + } + + return new RowsOfFields($data); + } + + /** + * Query KB. + * + * @param array $options + * Additional options for the command. + * + * @command reliefweb-semantic:query-kb + * + * @option reset. + * + * @default $options [] + * + * @usage reliefweb-semantic:query-kb id + * List jobs. + */ + public function queryKb( + array $options = [ + 'id' => 0, + 'q' => 0, + 'format' => 'table', + 'theme' => '', + 'country' => '', + ], + ) : null|RowsOfFields { + if (empty($options['id'])) { + return NULL; + } + + $result = $this->rwService->queryKb($options['id'], $options['q'], $options['theme'], $options['country']); + $data = []; + + foreach ($result as $item) { + $data[$item['id']] = [ + 'id' => $item['id'], + 'title' => $item['title'], + 'score' => round(100 * $item['score'], 2) . '%', + 'file' => $item['file'], + 'theme' => implode(', ', $item['theme'] ?? []), + 'country' => implode(', ', $item['country'] ?? []), + ]; + } + + return new RowsOfFields($data); + } + + /** + * Trigger sync. 
+ * + * @param array $options + * Additional options for the command. + * + * @command reliefweb-semantic:trigger-sync + * + * @option reset. + * + * @default $options [] + * + * @usage reliefweb-semantic:trigger-sync --id=xxx + * List datasources. + */ + public function triggerSync( + array $options = [ + 'id' => 0, + ], + ) { + $datasources = $this->getDatasources(); + + $aws_options = reliefweb_semantic_get_aws_client_options(); + $bedrock = new BedrockAgentClient($aws_options); + + if (!empty($options['id'])) { + $bedrock->startIngestionJob([ + 'dataSourceId' => $datasources[$options['id']]['id'], + 'knowledgeBaseId' => $datasources[$options['id']]['kb_id'], + ]); + + return; + } + + foreach ($datasources as $id => $datasource) { + $bedrock->startIngestionJob([ + 'dataSourceId' => $id, + 'knowledgeBaseId' => $datasource['kb_id'], + ]); + + // Rate limit is 0.1 / sec. + sleep(15); + } + } + + /** + * Get kbs. + */ + protected function getKbs($reset = 0) : array { + $data = $this->state->get('reliefweb_semantic_kbs', []); + + if (empty($data) || !empty($reset)) { + $data = []; + $aws_options = reliefweb_semantic_get_aws_client_options(); + $bedrock = new BedrockAgentClient($aws_options); + $result = $bedrock->listKnowledgeBases(); + + $kbs = $result->toArray()['knowledgeBaseSummaries']; + foreach ($kbs as $kb) { + $data[$kb['knowledgeBaseId']] = [ + 'id' => $kb['knowledgeBaseId'], + 'name' => $kb['name'], + 'status' => $kb['status'], + ]; + } + + // Remove tests. + unset($data['FV9YWTCSHX']); + unset($data['2ZOGICT5IP']); + $this->state->set('reliefweb_semantic_kbs', $data); + } + + return $data; + } + + /** + * Get data sources. 
+ */ + protected function getDatasources($reset = 0) { + $data = $this->state->get('reliefweb_semantic_datasources', []); + if (empty($data) || !empty($reset)) { + $kbs = $this->getKbs(); + $aws_options = reliefweb_semantic_get_aws_client_options(); + $bedrock = new BedrockAgentClient($aws_options); + + foreach ($kbs as $id => $kb) { + $result = $bedrock->listDataSources([ + 'knowledgeBaseId' => $id, + ]); + + $datasources = $result->toArray()['dataSourceSummaries']; + foreach ($datasources as $datasource) { + $data[$datasource['dataSourceId']] = [ + 'id' => $datasource['dataSourceId'], + 'name' => $datasource['name'], + 'status' => $datasource['status'], + 'kb_id' => $kb['id'], + 'kb_name' => $kb['name'], + 'kb_status' => $kb['status'], + ]; + } + } + + $this->state->set('reliefweb_semantic_datasources', $data); + } + + return $data; + } + + /** + * List API keys. + * + * @param array $options + * Additional options for the command. + * + * @command reliefweb-semantic:list-apikeys + * + * @option reset. + * + * @default $options [] + * + * @usage reliefweb-semantic:list-apikeys + * List datasources. + */ + public function listApikeys( + array $options = [ + 'reset' => 0, + 'format' => 'table', + ], + ) : RowsOfFields { + $data = $this->getApikeys($options['reset']); + return new RowsOfFields($data); + } + + /** + * Get data sources. 
+ */ + protected function getApikeys($reset = 0) { + $data = $this->state->get('reliefweb_semantic_apikeys', []); + if (empty($data) || !empty($reset)) { + $data = []; + $aws_options = reliefweb_semantic_get_aws_client_options(); + $api = new ApiGatewayClient($aws_options); + + $result = $api->getApiKeys([ + 'includeValues' => TRUE, + ]); + + $apikeys = $result->toArray()['items']; + foreach ($apikeys as $apikey) { + $data[$apikey['id']] = [ + 'id' => $apikey['id'], + 'name' => $apikey['name'], + 'key' => $apikey['value'], + 'status' => $apikey['enabled'], + ]; + } + } + + $this->state->set('reliefweb_semantic_apikeys', $data); + + return $data; + } + +} diff --git a/html/modules/custom/reliefweb_semantic/src/Commands/ReliefWebSemanticCommands.php b/html/modules/custom/reliefweb_semantic/src/Commands/ReliefWebSemanticCommands.php new file mode 100644 index 000000000..5f2d4bd6c --- /dev/null +++ b/html/modules/custom/reliefweb_semantic/src/Commands/ReliefWebSemanticCommands.php @@ -0,0 +1,608 @@ + [ + 'type' => 'node', + 'index' => 'rw-reports', + 'bucket' => 'rw-kb-reports', + 'field-list' => [ + 'nid' => 'id', + 'uuid' => 'uuid', + 'created' => 'created', + 'changed' => 'changed', + 'title' => 'title', + 'status' => 'status', + 'body' => 'body', + 'field_file' => 'files', + 'field_country' => 'country', + 'field_disaster' => 'disaster', + 'field_disaster_type' => 'disaster_type', + 'field_feature' => 'feature', + 'field_primary_country' => 'primary_country', + 'field_source' => 'source', + 'field_theme' => 'theme', + ], + ], + 'job' => [ + 'type' => 'node', + 'index' => 'rw-jobs', + 'bucket' => 'rw-kb-jobs', + 'field-list' => [ + 'nid' => 'id', + 'uuid' => 'uuid', + 'created' => 'created', + 'changed' => 'changed', + 'title' => 'title', + 'status' => 'status', + 'body' => 'body', + 'field_career_categories' => 'career_categories', + 'field_city' => 'city', + 'field_job_closing_date' => 'job_closing_date', + 'field_country' => 'country', + 'field_city' => 'city', + 
'field_how_to_apply' => 'how_to_apply', + 'field_job_type' => 'job_type', + 'field_job_experience' => 'job_experience', + 'field_source' => 'source', + 'field_theme' => 'theme', + ], + ], + 'training' => [ + 'type' => 'node', + 'index' => 'rw-trainings-2', + 'bucket' => 'rw-kb-trainings', + 'field-list' => [ + 'nid' => 'id', + 'uuid' => 'uuid', + 'created' => 'created', + 'changed' => 'changed', + 'title' => 'title', + 'status' => 'status', + 'body' => 'body', + 'field_country' => 'country', + 'field_city' => 'city', + 'field_source' => 'source', + 'field_theme' => 'theme', + 'field_training_type' => 'training_type', + 'field_cost' => 'cost', + 'field_training_language' => 'training_language', + 'field_fee_information' => 'fee_information', + 'field_career_categories' => 'career_categories', + 'field_training_format' => 'training_format', + ], + ], + ]; + + /** + * {@inheritdoc} + */ + public function __construct( + ConfigFactoryInterface $config_factory, + EntityFieldManagerInterface $entity_field_manager, + EntityTypeManagerInterface $entity_type_manager, + ModuleHandlerInterface $module_handler, + StateInterface $state, + ClientInterface $http_client, + FileSystemInterface $file_system, + RendererInterface $renderer, + ) { + $this->config = $config_factory->get('reliefweb_semantic.settings'); + $this->entityFieldManager = $entity_field_manager; + $this->entityTypeManager = $entity_type_manager; + $this->moduleHandler = $module_handler; + $this->state = $state; + $this->httpClient = $http_client; + $this->fileSystem = $file_system; + $this->renderer = $renderer; + } + + /** + * Index content in the ReliefWeb API. + * + * @param string $bundle + * Entity bundle to index. + * @param array $options + * Additional options for the command. + * + * @command reliefweb-semantic:index + * + * @option limit Maximum number of entities to index, defaults to 0 (all). + * @option offset ID of the entity from which to start the indexing, defaults + * to the most recent one. 
+ * + * @default $options [] + * + * @usage reliefweb-semantic:index --id=123 report + * Index the report with ID 123. + * @usage reliefweb-semantic:index --limit=10 report + * Index latest 10 reports. + */ + public function index( + $bundle = '', + array $options = [ + 'limit' => 10, + 'id' => 0, + ], + ) { + // Index all the resources. + if ($bundle === 'all') { + foreach ($this->bundles as $bundle => $info) { + $this->index($bundle, $options); + } + return; + } + + if (!empty($options['id'])) { + $this->processItem($bundle, $options['id']); + return; + } + + // Index the given bundles. + if (strpos($bundle, ',') > 0) { + $bundles = explode(',', $bundle); + foreach ($bundles as $bundle) { + if (isset($this->bundles[$bundle])) { + $this->index($bundle, $options); + } + } + return; + } + + // Index indexing options. + $limit = (int) ($options['limit'] ?: 10); + + // Launch the indexing or index removal. + try { + $num_items = $this->indexItems($bundle, $limit); + if ($num_items == 0) { + $this->logger->notice(strtr('Nothing left to index for @bundle', [ + '@bundle' => $bundle, + ])); + } + else { + $this->logger->notice(strtr('Indexed @num_items items for @bundle', [ + '@num_items' => $num_items, + '@bundle' => $bundle, + ])); + } + } + catch (\Exception $exception) { + if ($exception->getMessage() !== 'No entity to index.') { + $this->logger->error('(' . $exception->getCode() . ') ' . $exception->getMessage()); + } + else { + $this->logger->notice($exception->getMessage()); + } + } + } + + /** + * Index items. + */ + protected function indexItems($bundle, $limit = 10) : int { + $entity_type = $this->bundles[$bundle]['type'] ?? 
'';
+    if (empty($entity_type)) {
+      $this->logger->notice(strtr('Unknown entity type for @bundle', [
+        '@bundle' => $bundle,
+      ]));
+
+      // Nothing can be queried for an unknown bundle: stop here instead of
+      // asking the entity type manager for an empty entity type.
+      return 0;
+    }
+
+    $key = 'nid';
+    if ($entity_type == 'taxonomy_term') {
+      $key = 'tid';
+    }
+
+    $query = $this->entityTypeManager
+      ->getStorage($entity_type)
+      ->getQuery()
+      ->accessCheck(FALSE)
+      ->range(0, $limit)
+      ->sort($key, 'DESC');
+
+    // Resume just below the last indexed ID. When no ID has been stored yet
+    // (first run) no upper bound is applied, so indexing starts from the most
+    // recent entity; comparing against the default 0 would match nothing.
+    $last_indexed = $this->state->get('reliefweb_semantic_last_indexed_' . $bundle, 0);
+    if (!empty($last_indexed)) {
+      $query->condition($key, $last_indexed, '<');
+    }
+
+    if ($entity_type == 'node') {
+      $query->condition('type', $bundle);
+    }
+    elseif ($entity_type == 'taxonomy_term') {
+      $query->condition('vid', $bundle);
+    }
+
+    $ids = $query->execute();
+    if (empty($ids)) {
+      return 0;
+    }
+
+    $entities = $this->entityTypeManager->getStorage($entity_type)->loadMultiple($ids);
+    $count = 0;
+    foreach ($entities as $id => $entity) {
+      $data = $this->prepareItem($entity);
+      if (empty($data)) {
+        continue;
+      }
+
+      try {
+        $this->indexItem($bundle, $data);
+
+        // Only advance the cursor once the item has actually been sent.
+        $this->state->set('reliefweb_semantic_last_indexed_' . $bundle, $id);
+        $count++;
+      }
+      catch (\Throwable $th) {
+        $this->logger->notice(strtr('Unable to index @id for @bundle', [
+          '@id' => $data['id'],
+          '@bundle' => $bundle,
+        ]));
+      }
+    }
+
+    return $count;
+  }
+
+  /**
+   * Process an item.
+   */
+  protected function processItem(string $bundle, string $id) {
+    $entity_type = $this->bundles[$bundle]['type'] ?? 
'';
+    if (empty($entity_type)) {
+      // Substitute the placeholder with strtr() like every other message in
+      // this class.
+      $this->logger->notice(strtr('Unknown entity type for @bundle', [
+        '@bundle' => $bundle,
+      ]));
+
+      // Without an entity type there is no storage to load from.
+      return FALSE;
+    }
+
+    $entity = $this->entityTypeManager->getStorage($entity_type)->load($id);
+    if (empty($entity)) {
+      $this->logger->notice(strtr('Unable to load @id for @bundle', [
+        '@id' => $id,
+        '@bundle' => $bundle,
+      ]));
+
+      return FALSE;
+    }
+
+    // Guard against an ID that belongs to another bundle than the requested
+    // one.
+    if ($entity->bundle() != $bundle) {
+      $this->logger->notice(strtr('Bundle @a found for @id instead of @b', [
+        '@a' => $entity->bundle(),
+        '@id' => $id,
+        '@b' => $bundle,
+      ]));
+
+      return FALSE;
+    }
+
+    $data = $this->prepareItem($entity);
+    if (empty($data)) {
+      return FALSE;
+    }
+
+    try {
+      $this->indexItem($bundle, $data);
+      return TRUE;
+    }
+    catch (\Throwable $th) {
+      $this->logger->notice(strtr('Unable to index @id for @bundle', [
+        '@id' => $data['id'],
+        '@bundle' => $bundle,
+      ]));
+    }
+
+    return FALSE;
+  }
+
+  /**
+   * Prepare data for the index.
+   */
+  protected function prepareItem(ContentEntityInterface $entity) : array {
+    $this->logger->notice(strtr('Preparing @bundle: @title (@id)', [
+      '@bundle' => $entity->bundle(),
+      '@title' => $entity->label(),
+      '@id' => $entity->id(),
+    ]));
+
+    $data = [];
+    $field_list = $this->bundles[$entity->bundle()]['field-list'] ?? 
[]; + if (empty($field_list)) { + $this->logger->notice(strtr('No field list found for @bundle', [ + '@bundle' => $entity->bundle(), + ])); + + return []; + } + + $date = new \DateTime('now', new \DateTimeZone('UTC')); + $data['site'] = 'reliefweb'; + $data['timestamp'] = $date->format(\DateTime::ATOM); + $data['bundle'] = $entity->bundle(); + $data['nid'] = $entity->id(); + + /** @var \Drupal\node\NodeViewBuilder */ + $view_builder = $this->entityTypeManager->getViewBuilder('node'); + + if (!isset($data['html'])) { + $data['html'] = ''; + } + + foreach ($field_list as $field_name => $property_name) { + if (!$entity->hasField($field_name)) { + continue; + } + + $field_type = $entity->get($field_name)->getFieldDefinition()->getFieldStorageDefinition()->getType(); + switch ($field_type) { + case 'text_with_summary': + case 'text_long': + $data[$property_name] = $entity->get($field_name)->value; + + $build = $view_builder->viewField($entity->get($field_name), 'full'); + $data['html'] .= $this->renderer->renderPlain($build); + $data['html'] .= "\n\n"; + break; + + case 'entity_reference': + $data[$property_name] = []; + $as_string = []; + foreach ($entity->get($field_name)->referencedEntities() as $ref) { + $data[$property_name][] = (string) $ref->id(); + $as_string[] = $ref->label(); + } + + if (!empty($as_string)) { + $data['html'] .= '

' . $property_name . ': ' . implode(', ', $as_string) . '

';
+          }
+          break;
+
+        case 'datetime':
+          // Skip empty dates: \DateTime treats an empty value as "now", which
+          // would export the current time for fields that are not filled in.
+          $value = $entity->get($field_name)->value;
+          if (!empty($value)) {
+            $date = new \DateTime($value, new \DateTimeZone('UTC'));
+            // Store the formatted date in the exported data (it used to be
+            // written to an unrelated $item variable and was lost).
+            $data[$property_name] = $date->format(\DateTime::ATOM);
+          }
+          break;
+
+        case 'reliefweb_file':
+          $data['files'] = [];
+          /** @var \Drupal\reliefweb_files\Plugin\Field\FieldType\ReliefWebFile $item */
+          foreach ($entity->get($field_name) as $item) {
+            $data['files'][] = $item->loadFile()->getFileUri();
+          }
+          break;
+
+        default:
+          $data[$property_name] = $entity->get($field_name)->value;
+      }
+    }
+
+    return $data;
+  }
+
+  /**
+   * Index the index.
+   */
+  protected function indexItem(string $bundle, array $data, array $files = []) {
+    $index = $this->bundles[$bundle]['index'] ?? [];
+    if (empty($index)) {
+      $this->logger->notice(strtr('No index found for @bundle', [
+        '@bundle' => $bundle,
+      ]));
+
+      return [];
+    }
+
+    if (empty($files) && isset($data['files'])) {
+      $files = $data['files'];
+      unset($data['files']);
+    }
+
+    // Dump title and html field into a PDF.
+    // @see https://docs.aws.amazon.com/bedrock/latest/userguide/kb-chunking-parsing.html#kb-advanced-parsing
+    if (!empty($data['html'])) {
+      $content = '

' . $data['title'] . '

' . "\n\n" . $data['html']; + $destination = 'temporary://' . $data['id'] . '.pdf'; + + $html2pdf = new Html2Pdf(); + $html2pdf->writeHTML($content); + $content = $html2pdf->output($data['id'] . '.pdf', 'S'); + + $this->fileSystem->saveData($content, $destination, FileSystemInterface::EXISTS_REPLACE); + $files[] = $destination; + } + + // Dump metadata. + if (isset($data['html'])) { + unset($data['html']); + } + if (isset($data['body'])) { + unset($data['body']); + } + + // Remove empty fields. + $data = array_filter($data); + + $content = json_encode([ + 'metadataAttributes' => $data, + ]); + $metadata_file = 'temporary://' . $data['id'] . '.pdf.metadata.json'; + $this->fileSystem->saveData($content, $metadata_file, FileSystemInterface::EXISTS_REPLACE); + + foreach ($files as $file) { + $absolute_path = $this->fileSystem->realpath($file); + if (!$absolute_path) { + $this->logger->notice(strtr('Unable to process @file for @id', [ + '@file' => $file, + '@id' => $data['id'], + ])); + continue; + } + + $this->sendToS3($this->bucket, $absolute_path); + $basename = basename($absolute_path); + $this->sendToS3($this->bucket, $metadata_file, $basename . '.metadata.json'); + } + } + + /** + * Store file and metadata on S3. + */ + protected function sendToS3(string $bucket, string $file_name, string $save_as = '') { + $client_options = reliefweb_semantic_get_aws_client_options(); + $client = new S3Client($client_options); + + if (empty($save_as)) { + $save_as = basename($file_name); + } + + $client->putObject([ + 'Bucket' => $bucket, + 'Key' => $save_as, + 'SourceFile' => $file_name, + ]); + } + + /** + * Perform a request against the AWS API. + * + * @param string $method + * Request method. + * @param string $endpoint + * Request endpoint. + * @param mixed|null $payload + * Optional payload (will be converted to JSON if no content type is + * provided). + * @param string|null $content_type + * Optional content type of the payload. 
If not defined it is assumed to be + * JSON. + * @param array $valid_status_codes + * List of valid status codes that should not be logged as errors. + * + * @return \Psr\Http\Message\ResponseInterface|null + * The response or NULL if the request was not successful. + */ + protected function request(string $method, string $endpoint, $payload = NULL, ?string $content_type = NULL, array $valid_status_codes = []): ?ResponseInterface { + $url = rtrim($this->config->get('url'), '/') . '/' . ltrim($endpoint, '/'); + $options = []; + + if (isset($payload)) { + if (empty($content_type)) { + $options['json'] = $payload; + } + else { + $options['body'] = $payload; + $options['headers']['Content-Type'] = $content_type; + } + } + + try { + /** @var \Psr\Http\Message\ResponseInterface $response */ + $response = $this->httpClient->request($method, $url, $options); + return $response; + } + catch (BadResponseException $exception) { + $response = $exception->getResponse(); + $status_code = $response->getStatusCode(); + if (!in_array($status_code, $valid_status_codes)) { + $this->logger()->error(strtr('@method request to @endpoint failed with @status error: @error', [ + '@method' => $method, + '@endpoint' => $endpoint, + '@status' => $status_code, + '@error' => $exception->getMessage(), + ])); + } + } + catch (\Exception $exception) { + $this->logger()->error(strtr('@method request to @endpoint failed with @status error: @error', [ + '@method' => $method, + '@endpoint' => $endpoint, + '@status' => $exception->getCode(), + '@error' => $exception->getMessage(), + ])); + } + + return NULL; + } + +} diff --git a/html/modules/custom/reliefweb_semantic/src/Services/ReliefWebSemanticService.php b/html/modules/custom/reliefweb_semantic/src/Services/ReliefWebSemanticService.php new file mode 100644 index 000000000..ef89a961a --- /dev/null +++ b/html/modules/custom/reliefweb_semantic/src/Services/ReliefWebSemanticService.php @@ -0,0 +1,119 @@ + [ + 'vectorSearchConfiguration' => [ + 
'numberOfResults' => 10,
+        ],
+      ],
+    ];
+
+    if (!empty($filters)) {
+      if (count($filters) == 1) {
+        // A single filter is sent as a bare "in" clause. array_key_first()
+        // replaces reset(array_keys(...)), which passed a temporary by
+        // reference and raised a notice.
+        $key = array_key_first($filters);
+        $value = reset($filters);
+        $kb_filter = [
+          'retrievalConfiguration' => [
+            'vectorSearchConfiguration' => [
+              'filter' => [
+                'in' => [
+                  'key' => $key,
+                  'value' => explode(',', $value),
+                ],
+              ],
+              'numberOfResults' => 10,
+            ],
+          ],
+        ];
+      }
+      else {
+        // Several filters are combined: every "in" clause must match.
+        $all_filters = [];
+        foreach ($filters as $key => $value) {
+          $all_filters[] = [
+            'in' => [
+              'key' => $key,
+              'value' => explode(',', $value),
+            ],
+          ];
+        }
+
+        $kb_filter = [
+          'retrievalConfiguration' => [
+            'vectorSearchConfiguration' => [
+              'numberOfResults' => 10,
+              'overrideSearchType' => 'HYBRID',
+              'filter' => [
+                'andAll' => $all_filters,
+              ],
+            ],
+          ],
+        ];
+      }
+    }
+
+    $br_options = [
+      'knowledgeBaseId' => $id,
+      'retrievalQuery' => [
+        'text' => $q,
+      ],
+    ] + $kb_filter;
+
+    $result = $bedrock->retrieve($br_options);
+
+    $result = $result->toArray()['retrievalResults'] ?? [];
+    $data = [];
+
+    // Key the rows by the nid found in each result's metadata, so several
+    // results sharing a nid collapse into one row (the last one wins).
+    foreach ($result as $item) {
+      $data[$item['metadata']['nid']] = [
+        'id' => $item['metadata']['nid'],
+        'title' => $item['metadata']['title'],
+        'score' => $item['score'],
+        'file' => $item['location']['s3Location']['uri'],
+        'theme' => $item['metadata']['theme'] ?? [],
+        'country' => $item['metadata']['country'] ?? [],
+      ];
+    }
+
+    return $data;
+  }
+
+}