I generate several .txt files containing text statistics inside a Docker container on a GPU server. After the process finishes (it runs for about 25 h), I can't copy some of the generated files out of the container for later use, while other generated files can be copied.
I might be doing something wrong while generating the files. Here is the code:
from collections import Counter, defaultdict
import zlib
import re
import numpy as np
import string
import binascii
from tqdm import tqdm
import stanza
STOP_WORDS = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"])
stanza.download("en")
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
for filename in ["dataset.csv"]:
    documents = []
    # ...
    # Code here that generates and stores a document
    # that can be copied out; this part also defines
    # basename and dictionary, which are used below
    # ...
    i = 0
    token_to_id = {}
    with open(f"{basename}_token_to_id_lemmatized.txt", "w") as f, \
         open(f"{basename}_token_to_docs_lemmatized.txt", "w") as f2, \
         open(f"{basename}_token_to_freqs_lemmatized.txt", "w") as f3:
        # trick to reduce memory in use: process token ids in ranges,
        # so only one range's postings are held in memory at a time
        for max_range in [(0, 1000), (1000, 2000), (2000, 3000),
                          (3000, 10000), (10000, 20000), (20000, 1000000)]:
            token_to_docs = defaultdict(list)
            for doc_id, doc in enumerate(tqdm(documents)):
                for token, num_token in Counter(doc.split("|")).items():
                    if not token or token not in dictionary:
                        continue
                    if token not in token_to_id:
                        token_to_id[token] = i
                        f.write(f"{token}\n")
                        i += 1
                    token_id = token_to_id[token]
                    if max_range[0] <= token_id < max_range[1]:
                        token_to_docs[token_id].append((doc_id, num_token))
            # write one line per token id of the current range
            for token_id in tqdm(range(max_range[0], max_range[1])):
                for doc_id, num_token in token_to_docs[token_id]:
                    f2.write(f"{doc_id},")
                    f3.write(f"{num_token},")
                if token_to_docs[token_id]:
                    f2.write("\n")
                    f3.write("\n")
I tried to copy the files out of the container from the console with:
docker cp container_id:/container_workdir/only_premise_dataset_token_to_id_lemmatized.txt /destination
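For debugging, the presence and exact names of the files inside the container can be checked with something like the following (just a diagnostic sketch, assuming the container is still running and using the same container_id and /container_workdir placeholders as above):
# list everything the script wrote into the working directory
docker exec container_id ls -l /container_workdir
# or search the container's filesystem for the lemmatized .txt files
docker exec container_id find / -name "*_lemmatized.txt" 2>/dev/null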
Here is my Dockerfile:
FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04
RUN apt-get update && apt-get install -y python3 python3-pip git build-essential libssl-dev libffi-dev #libcupti-dev
WORKDIR /container_workdir
COPY requirements.txt ./
RUN pip3 install --upgrade pip
RUN pip3 install --upgrade setuptools
RUN pip install -r requirements.txt
COPY . .
ENV CUDA_VISIBLE_DEVICES=7
RUN export CUDA_VISIBLE_DEVICES=7
CMD bash
Note that I also tried re-storing the files as .csv and .tsv, and also with pickle, inside the container. None of these approaches worked either.
EDIT: As @Minarth stated, I get an error:
Error: No such container:path: container_id:/container_workdir/only_premise_dataset_token_to_id_lemmatized.txt
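For completeness, one way to see which files the container actually created relative to its image is the following (another diagnostic sketch; note that it will not show files written into a mounted volume):
docker diff container_id | grep lemmatized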