mirror of
https://github.com/nlmatics/nlm-ingestor.git
synced 2024-08-02 20:58:47 +03:00
Merge branch 'main' into update-to-nlm-2.9.2_v2
This commit is contained in:
4
.github/workflows/docker-publish.yml
vendored
4
.github/workflows/docker-publish.yml
vendored
@@ -35,9 +35,9 @@ jobs:
|
||||
# https://github.com/sigstore/cosign-installer
|
||||
- name: Install cosign
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: sigstore/cosign-installer@6e04d228eb30da1757ee4e1dd75a0ec73a653e06 #v3.1.1
|
||||
uses: sigstore/cosign-installer@v3.5.0
|
||||
with:
|
||||
cosign-release: 'v2.1.1'
|
||||
cosign-release: 'v2.2.4'
|
||||
|
||||
# Setup QEMU for multi-platform build support
|
||||
# https://docs.docker.com/build/ci/github-actions/multi-platform/
|
||||
|
||||
34
Dockerfile
34
Dockerfile
@@ -4,26 +4,26 @@ RUN apt-get update && apt-get -y --no-install-recommends install libgomp1
|
||||
ENV APP_HOME /app
|
||||
# install Java
|
||||
RUN mkdir -p /usr/share/man/man1 && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y openjdk-17-jre-headless
|
||||
apt-get update -y && \
|
||||
apt-get install -y openjdk-17-jre-headless
|
||||
# install essential packages
|
||||
RUN apt-get install -y \
|
||||
libxml2-dev libxslt-dev \
|
||||
build-essential libmagic-dev
|
||||
libxml2-dev libxslt-dev \
|
||||
build-essential libmagic-dev
|
||||
# install tesseract
|
||||
RUN apt-get install -y \
|
||||
tesseract-ocr \
|
||||
lsb-release \
|
||||
&& echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
|
||||
&& apt-get update -oAcquire::AllowInsecureRepositories=true \
|
||||
&& apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
tesseract-ocr libtesseract-dev \
|
||||
&& wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
|
||||
tesseract-ocr \
|
||||
lsb-release \
|
||||
&& echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
|
||||
&& apt-get update -oAcquire::AllowInsecureRepositories=true \
|
||||
&& apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
tesseract-ocr libtesseract-dev \
|
||||
&& wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
|
||||
RUN apt-get install unzip -y && \
|
||||
apt-get install git -y && \
|
||||
apt-get autoremove -y
|
||||
apt-get install git -y && \
|
||||
apt-get autoremove -y
|
||||
WORKDIR ${APP_HOME}
|
||||
COPY . ./
|
||||
RUN pip install --upgrade pip setuptools
|
||||
@@ -32,5 +32,7 @@ RUN mkdir -p -m 0600 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
|
||||
RUN pip install -r requirements.txt
|
||||
RUN python -m nltk.downloader stopwords
|
||||
RUN python -m nltk.downloader punkt
|
||||
RUN python -c "import tiktoken; tiktoken.get_encoding(\"cl100k_base\")"
|
||||
RUN chmod +x run.sh
|
||||
CMD ./run.sh
|
||||
EXPOSE 5001
|
||||
CMD ./run.sh
|
||||
|
||||
@@ -32,7 +32,7 @@ There are two ways to process these types of documents
|
||||
1. Install latest version of java from https://www.oracle.com/java/technologies/downloads/
|
||||
2. Run the tika server:
|
||||
```
|
||||
java -jar <path_to_nlm_ingestor>/jars/tika-server-standard-nlm-modified-2.9.2_v2.jar
|
||||
java -jar <path_to_nlm_ingestor>/jars/tika-server-standard-nlm-modified-2.9.2_v2.jar
|
||||
```
|
||||
3. Install the ingestor
|
||||
```
|
||||
|
||||
BIN
jars/tika-server-standard-nlm-modified-2.9.2_v1.jar
Normal file
BIN
jars/tika-server-standard-nlm-modified-2.9.2_v1.jar
Normal file
Binary file not shown.
Reference in New Issue
Block a user