Dataset:
Submitter | Filename | Virus
Lyn | 012345.fasta | abc/USA/abc-01234567/1234
Lyn | 012345.fasta | abc/USA/abc-04567898/1234
Lyn | 012345.fasta | abc/USA/abc-78935421/1234
Files within a directory called 012345:
04567898-abc-a124-pol.fasta
78935421-abc-a124-pol.fasta
01234567-abc-a124-pol.fasta
Inside of a file:
>01234567-abc-a124-pol
ACTGATGATGAGATAGA
ATAGATAGATAGATAG
AGATAGATAGATA
Note: the same 8-digit number appears in Column C (the Virus column), in the fasta file name, and in the fasta file header.
What I want to do is take the entire string from Column C, and overwrite the current fasta file header. The output would look like this:
Fasta file name:
01234567-abc-a124-pol.fasta
Inside of a file:
>abc/USA/abc-01234567/1234
ACTGATGATGAGATAGA
ATAGATAGATAGATAG
AGATAGATAGATA
Here is the code I have written so far while trying to do that. I am struggling to get it to work. Please let me know if there is a simpler way.
def parse(file_name):
    """Append every FASTA record found in a zip archive to ``test_dict.fasta``.

    Each archive member whose name contains ".fasta" and does not contain
    ".ipynb" is read with ``SimpleFastaParser``.  Every record is written as
    a ``>`` line holding the first whitespace-delimited word of its title,
    followed by its sequence on the next line.  The output file is opened in
    append mode, relative to the current working directory.

    Args:
        file_name: The zip archive to read; passed to ``zipfile.ZipFile``.

    Returns:
        ``file_name``, unchanged.
    """
    with zipfile.ZipFile(file_name, "r") as zf:
        for name in zf.namelist():
            if ".fasta" not in name or ".ipynb" in name:
                continue
            with io.TextIOWrapper(zf.open(name)) as file:
                # Join records with a newline so that, when one member holds
                # several records, each ">" header starts on its own line
                # instead of being glued to the previous sequence.
                records = "\n".join(
                    ">{}\n{}".format(title.split(None, 1)[0], sequence)
                    for title, sequence in SimpleFastaParser(file)
                )
            with open("test_dict.fasta", "a") as out:
                out.write("{}\n".format(records))
    return file_name
def names(file_name):
    """Print the identifiers of all FASTA records inside a zip archive.

    Archive members are considered when their name contains ".fasta" and
    does not contain ".ipynb".  For every record parsed from such a member,
    the first whitespace-delimited word of its title is collected; the
    complete list is printed once all members have been read.

    Args:
        file_name: The zip archive to read; passed to ``zipfile.ZipFile``.

    Returns:
        None.
    """
    record_ids = []
    with zipfile.ZipFile(file_name, "r") as archive:
        for member in archive.namelist():
            if ".fasta" not in member or ".ipynb" in member:
                continue
            with io.TextIOWrapper(archive.open(member)) as handle:
                for header, _sequence in SimpleFastaParser(handle):
                    record_ids.append(header.split(None, 1)[0])
    print(record_ids)
def pandas(file_name):
    """Build a DataFrame describing every FASTA record in a zip archive.

    Archive members are considered when their name contains ".fasta" and
    does not contain ".ipynb".  Each record parsed from such a member
    becomes one row, so a member holding several records yields several
    rows, and a member holding no records yields none.

    Args:
        file_name: The zip archive to read; passed to ``zipfile.ZipFile``.

    Returns:
        A DataFrame with the columns ``Name`` (first whitespace-delimited
        word of the record title), ``Length`` (number of characters in the
        sequence) and ``Sequence`` (the sequence string), in that order.
    """
    rows = []
    with zipfile.ZipFile(file_name, "r") as zf:
        for name in zf.namelist():
            if ".fasta" not in name or ".ipynb" in name:
                continue
            with io.TextIOWrapper(zf.open(name)) as file:
                for title, sequence in SimpleFastaParser(file):
                    rows.append(
                        {
                            "Name": title.split(None, 1)[0],
                            "Length": len(sequence),
                            "Sequence": sequence,
                        }
                    )
    # Passing the column list explicitly keeps the column order and makes an
    # archive without any records produce an empty frame with these columns.
    return pd.DataFrame(rows, columns=["Name", "Length", "Sequence"])
question from:
https://stackoverflow.com/questions/65833365/overwrite-header-of-fasta-file-with-a-column-value-by-matching-the-file-name-wi