From ef33864adfd5971073e562c92d0771dc9b855fd6 Mon Sep 17 00:00:00 2001
From: Andy Grunwald
Date: Thu, 6 Feb 2025 18:40:21 +0100
Subject: [PATCH] New Script: Apache Tika (#2079)

* New Script: Apache Tika

* Temp: Replace github URLs to my own fork

* Add additional dependencies according to the Docker image installation

See https://github.com/apache/tika-docker/blob/master/full/Dockerfile

* Apache Tika: Set correct tags

* Apache Tika: Set TODO to make it updateable

* Apache Tika: Fix "software-properties-common: command not found"

* Apache Tika: Automate version detection

* Apache Tika: Add `update_script`

* Apache Tika: Added clean up of `/opt/apache-tika/tika-server-standard-prev-version.jar` after upgrade

* Apache Tika: Bump up ram to 2048

* Apache Tika: Set updateable to true

* Apache Tika: Switch from `default-jdk` to `openjdk-17-jre-headless`

* Apache Tika: Removed comment about Docker file

* Apache Tika: Removed empty line

* Revert "Temp: Replace github URLs to my own fork"

This reverts commit f1c5d8720696cdbfde9471abfff07e4b7b71bc6d.
---
 ct/apache-tika.sh              | 80 ++++++++++++++++++++++++++++++++
 install/apache-tika-install.sh | 83 ++++++++++++++++++++++++++++++++++
 json/apache-tika.json          | 34 ++++++++++++++
 3 files changed, 197 insertions(+)
 create mode 100755 ct/apache-tika.sh
 create mode 100644 install/apache-tika-install.sh
 create mode 100644 json/apache-tika.json

diff --git a/ct/apache-tika.sh b/ct/apache-tika.sh
new file mode 100755
index 00000000..6227de95
--- /dev/null
+++ b/ct/apache-tika.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+source <(curl -s https://raw.githubusercontent.com/community-scripts/ProxmoxVE/main/misc/build.func)
+# Copyright (c) 2021-2025 community-scripts ORG
+# Author: Andy Grunwald (andygrunwald)
+# License: MIT | https://github.com/community-scripts/ProxmoxVE/raw/main/LICENSE
+# Source: https://github.com/apache/tika/
+
+# App Default Values
+APP="Apache-Tika"
+var_tags="document"
+var_cpu="1"
+var_ram="2048"
+var_disk="10"
+var_os="debian"
+var_version="12"
+var_unprivileged="1"
+
+# App Output & Base Settings
+header_info "$APP"
+base_settings
+
+# Core
+variables
+color
+catch_errors
+
+# Updates an existing installation: when the newest release listed on
+# dlcdn.apache.org differs from the one recorded in /opt/${APP}_version.txt
+# (or that file is missing), the server jar is replaced and the service
+# restarted. Exits in every case.
+function update_script() {
+  header_info
+  check_container_storage
+  check_container_resources
+  if [[ ! -f /etc/systemd/system/apache-tika.service ]]; then
+    msg_error "No ${APP} Installation Found!"
+    exit
+  fi
+  RELEASE="$(wget -qO- https://dlcdn.apache.org/tika/ | grep -oP '(?<=href=")[0-9]+\.[0-9]+\.[0-9]+(?=/")' | sort -V | tail -n1)"
+  # An empty RELEASE means the listing could not be fetched or parsed; bail out
+  # before stopping the service for a download that cannot succeed.
+  if [[ -z "${RELEASE}" ]]; then
+    msg_error "Unable to determine the latest ${APP} release"
+    exit
+  fi
+  if [[ ! -f "/opt/${APP}_version.txt" ]] || [[ "${RELEASE}" != "$(cat "/opt/${APP}_version.txt")" ]]; then
+    msg_info "Stopping ${APP}"
+    systemctl stop apache-tika
+    msg_ok "Stopped ${APP}"
+
+    msg_info "Updating ${APP} to v${RELEASE}"
+    cd /opt/apache-tika
+    wget -q "https://dlcdn.apache.org/tika/${RELEASE}/tika-server-standard-${RELEASE}.jar"
+    # Keep the old jar until the new one is in place and the service is back up.
+    mv --force tika-server-standard.jar tika-server-standard-prev-version.jar
+    mv "tika-server-standard-${RELEASE}.jar" tika-server-standard.jar
+    echo "${RELEASE}" >"/opt/${APP}_version.txt"
+    msg_ok "Updated ${APP} to v${RELEASE}"
+
+    msg_info "Starting ${APP}"
+    systemctl start apache-tika
+    msg_ok "Started ${APP}"
+    msg_info "Cleaning Up"
+    rm -rf /opt/apache-tika/tika-server-standard-prev-version.jar
+    msg_ok "Cleanup Completed"
+    msg_ok "Updated Successfully"
+  else
+    msg_ok "No update required. ${APP} is already at v${RELEASE}"
+  fi
+  exit
+}
+
+start
+build_container
+description
+
+msg_ok "Completed Successfully!\n"
+echo -e "${CREATING}${GN}${APP} setup has been successfully initialized!${CL}"
+echo -e "${INFO}${YW} Access it using the following URL:${CL}"
+echo -e "${TAB}${GATEWAY}${BGN}http://${IP}:9998${CL}"
\ No newline at end of file
diff --git a/install/apache-tika-install.sh b/install/apache-tika-install.sh
new file mode 100644
index 00000000..1976e60b
--- /dev/null
+++ b/install/apache-tika-install.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2021-2025 community-scripts ORG
+# Author: Andy Grunwald (andygrunwald)
+# License: MIT | https://github.com/community-scripts/ProxmoxVE/raw/main/LICENSE
+# Source: https://github.com/apache/tika/
+
+source /dev/stdin <<< "$FUNCTIONS_FILE_PATH"
+color
+verb_ip6
+catch_errors
+setting_up_container
+network_check
+update_os
+
+msg_info "Installing Dependencies"
+$STD apt-get install -y \
+  curl \
+  sudo \
+  mc \
+  software-properties-common \
+  gdal-bin \
+  tesseract-ocr \
+  tesseract-ocr-eng \
+  tesseract-ocr-ita \
+  tesseract-ocr-fra \
+  tesseract-ocr-spa \
+  tesseract-ocr-deu
+# Pre-accept the Microsoft core fonts EULA before ttf-mscorefonts-installer is
+# installed. $STD wraps the consumer, not echo, so nothing $STD does to its
+# command's output can swallow the piped selection (NOTE(review): confirm).
+echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | $STD debconf-set-selections
+$STD apt-get install -y \
+  xfonts-utils \
+  fonts-freefont-ttf \
+  fonts-liberation \
+  ttf-mscorefonts-installer \
+  cabextract
+msg_ok "Installed Dependencies"
+
+msg_info "Setup OpenJDK"
+$STD apt-get install -y \
+  openjdk-17-jre-headless
+msg_ok "Setup OpenJDK"
+
+msg_info "Installing Apache Tika"
+mkdir -p /opt/apache-tika
+cd /opt/apache-tika
+# Newest x.y.z directory in the download listing, picked by version sort.
+RELEASE="$(wget -qO- https://dlcdn.apache.org/tika/ | grep -oP '(?<=href=")[0-9]+\.[0-9]+\.[0-9]+(?=/")' | sort -V | tail -n1)"
+wget -q "https://dlcdn.apache.org/tika/${RELEASE}/tika-server-standard-${RELEASE}.jar"
+mv "tika-server-standard-${RELEASE}.jar" tika-server-standard.jar
+echo "${RELEASE}" >"/opt/${APPLICATION}_version.txt"
+msg_ok "Installed Apache Tika"
+
+msg_info "Creating Service"
+# Unquoted heredoc: \$MAINPID is escaped so it is written to the unit literally.
+cat <<EOF >/etc/systemd/system/apache-tika.service
+[Unit]
+Description=Apache Tika
+Documentation=https://tika.apache.org/
+After=syslog.target network.target
+
+[Service]
+User=root
+Restart=always
+Type=simple
+ExecStart=java -jar /opt/apache-tika/tika-server-standard.jar --host 0.0.0.0 --port 9998
+ExecReload=/bin/kill -HUP \$MAINPID
+
+[Install]
+WantedBy=multi-user.target
+EOF
+systemctl enable -q --now apache-tika
+msg_ok "Created Service"
+
+motd_ssh
+customize
+
+msg_info "Cleaning up"
+$STD apt-get -y autoremove
+$STD apt-get -y autoclean
+msg_ok "Cleaned"
diff --git a/json/apache-tika.json b/json/apache-tika.json
new file mode 100644
index 00000000..1e891727
--- /dev/null
+++ b/json/apache-tika.json
@@ -0,0 +1,34 @@
+{
+    "name": "Apache Tika",
+    "slug": "apache-tika",
+    "categories": [
+        12
+    ],
+    "date_created": "2025-02-05",
+    "type": "ct",
+    "updateable": true,
+    "privileged": false,
+    "interface_port": 9998,
+    "documentation": null,
+    "website": "https://tika.apache.org/",
+    "logo": "https://tika.apache.org/tika.png",
+    "description": "The Apache Tika™ toolkit detects and extracts metadata and text from over a thousand different file types (such as PPT, XLS, and PDF). All of these file types can be parsed through a single interface, making Tika useful for search engine indexing, content analysis, translation, and much more.",
+    "install_methods": [
+        {
+            "type": "default",
+            "script": "ct/apache-tika.sh",
+            "resources": {
+                "cpu": 1,
+                "ram": 2048,
+                "hdd": 10,
+                "os": "debian",
+                "version": "12"
+            }
+        }
+    ],
+    "default_credentials": {
+        "username": null,
+        "password": null
+    },
+    "notes": []
+}
\ No newline at end of file