Update 2019 (PEG parser):
This answer has received quite a bit of attention, so I felt I should add another possibility, namely a parsing option. Here we could use a PEG
parser instead (e.g. parsimonious) in combination with a NodeVisitor class:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import pandas as pd
# PEG grammar for the school/grade/student text format.
# The string is raw so that the regex escapes (\d, \w, \s) inside the ~"..."
# terminals reach parsimonious unchanged; without the backslashes those
# terminals would match the literal letters d, w and s instead.
grammar = Grammar(
    r"""
    schools         = (school_block / ws)+

    school_block    = school_header ws grade_block+
    grade_block     = grade_header ws name_header ws (number_name)+ ws score_header ws (number_score)+ ws?

    school_header   = ~"^School = (.*)"m
    grade_header    = ~"^Grade = (\d+)"m

    name_header     = "Student number, Name"
    score_header    = "Student number, Score"

    number_name     = index comma name ws
    number_score    = index comma score ws

    comma           = ws? "," ws?

    index           = number+
    score           = number+

    number          = ~"\d+"
    name            = ~"[A-Z]\w+"
    ws              = ~"\s*"
    """
)

# NOTE: `data` must hold the raw text to parse; it is not defined in this snippet.
tree = grammar.parse(data)
class SchoolVisitor(NodeVisitor):
    """Collect one record per score line while walking the parse tree.

    After a visit, ``output`` holds tuples of
    ``(school, grade, student number, name, score)``; every element is the
    matched text, i.e. a string (or ``None`` when nothing was captured).

    State is created per instance in ``__init__``. Keeping the lists as class
    attributes would make every visitor append to the same shared list.
    """

    def __init__(self):
        super().__init__()
        self.output = []            # finished records
        self.names = []             # (index, name) pairs of the current grade
        self.current_school = None  # text captured by the last school header
        self.current_grade = None   # text captured by the last grade header

    def _getName(self, idx):
        """Return the name stored for student number ``idx`` in the current
        grade, or ``None`` if that number has not been seen."""
        for index, name in self.names:
            if index == idx:
                return name
        return None

    def generic_visit(self, node, visited_children):
        """Fallback for rules without a dedicated ``visit_*`` method."""
        return node.text or visited_children

    def visit_school_header(self, node, children):
        """Remember the school name (regex group 1 of the header match)."""
        self.current_school = node.match.group(1)

    def visit_grade_header(self, node, children):
        """Remember the grade and start a fresh name table for it."""
        self.current_grade = node.match.group(1)
        self.names = []

    def visit_number_name(self, node, children):
        """Record the (index, name) pair of a ``number_name`` line."""
        index, name = None, None
        for child in node.children:
            if child.expr.name == 'name':
                name = child.text
            elif child.expr.name == 'index':
                index = child.text

        self.names.append((index, name))

    def visit_number_score(self, node, children):
        """Build the full record for a ``number_score`` line and store it."""
        index, score = None, None
        for child in node.children:
            if child.expr.name == 'index':
                index = child.text
            elif child.expr.name == 'score':
                score = child.text

        # The name was recorded earlier under the same student number.
        name = self._getName(index)

        # build the entire entry
        entry = (self.current_school, self.current_grade, index, name, score)
        self.output.append(entry)
# Walk the parse tree; the visitor accumulates its records in `sv.output`.
sv = SchoolVisitor()
sv.visit(tree)

# Turn the collected tuples into a labelled DataFrame and display it.
df = pd.DataFrame.from_records(
    sv.output,
    columns=["School", "Grade", "Student number", "Name", "Score"],
)
print(df)
Regex option (original answer)
Well then, watching Lord of the Rings for the xth time, I had to bridge some time until the very finale:
Broken down, the idea is to split the problem up into several smaller problems:
- Separate each school
- ... each grade
- ... student and scores
- ... bind them together in a dataframe afterwards
The school part (see a demo on regex101.com)
^
School\s*=\s*(?P<school_name>.+)
(?P<school_content>[\s\S]+?)
(?=^School|\Z)
The grade part (another demo on regex101.com)
^
Grade\s*=\s*(?P<grade>.+)
(?P<students>[\s\S]+?)
(?=^Grade|\Z)
The student/score part (last demo on regex101.com):
^
Student\ number,\ Name[\n\r]
(?P<student_names>(?:^\d+.+[\n\r])+)
\s*
^
Student\ number,\ Score[\n\r]
(?P<student_scores>(?:^\d+.+[\n\r])+)
The rest is a generator expression which is then fed into the DataFrame
constructor (along with the column names).
The code:
import pandas as pd, re
# One match per school: the name after "School =" and everything up to the
# next line starting with "School" (or the end of the text).
rx_school = re.compile(r'''
    ^
    School\s*=\s*(?P<school_name>.+)
    (?P<school_content>[\s\S]+?)    # lazy: stop at the first lookahead hit
    (?=^School|\Z)
''', re.MULTILINE | re.VERBOSE)
# One match per grade inside a school's content: the grade after "Grade ="
# and everything up to the next line starting with "Grade" (or the end).
rx_grade = re.compile(r'''
    ^
    Grade\s*=\s*(?P<grade>.+)
    (?P<students>[\s\S]+?)          # lazy: stop at the first lookahead hit
    (?=^Grade|\Z)
''', re.MULTILINE | re.VERBOSE)
# Matches a "Student number, Name" table followed by a "Student number, Score"
# table. Spaces are escaped because re.VERBOSE ignores unescaped whitespace
# in the pattern.
rx_student_score = re.compile(r'''
    ^
    Student\ number,\ Name[\n\r]
    (?P<student_names>(?:^\d+.+[\n\r])+)    # consecutive "<number>, <name>" lines
    \s*
    ^
    Student\ number,\ Score[\n\r]
    (?P<student_scores>(?:^\d+.+[\n\r])+)   # consecutive "<number>, <score>" lines
''', re.MULTILINE | re.VERBOSE)
# Walk schools -> grades -> name/score tables and lazily emit one
# (school, grade, student number, name, score) tuple per student.
# NOTE: `string` must hold the raw input text; it is not defined in this snippet.
result = (
    (school.group('school_name'), grade.group('grade'), student_number, name, score)
    for school in rx_school.finditer(string)
    for grade in rx_grade.finditer(school.group('school_content'))
    for student_score in rx_student_score.finditer(grade.group('students'))
    # Both groups end with a line break: drop it, split into lines, and pair
    # the n-th name line with the n-th score line (pairing is by position).
    for name_line, score_line in zip(
        student_score.group('student_names')[:-1].split("\n"),
        student_score.group('student_scores')[:-1].split("\n"),
    )
    # Single-element lists bind a value to a name inside the generator.
    for student_number in [name_line.split(", ")[0]]
    for name in [name_line.split(", ")[1]]
    for score in [score_line.split(", ")[1]]
)

df = pd.DataFrame(result, columns=['School', 'Grade', 'Student number', 'Name', 'Score'])
print(df)
Condensed:
# Condensed (non-verbose) form: school name plus everything up to the next
# line starting with "School" or the end of the text.
rx_school = re.compile(r'^School\s*=\s*(?P<school_name>.+)(?P<school_content>[\s\S]+?)(?=^School|\Z)', re.MULTILINE)
# Condensed (non-verbose) form: grade plus everything up to the next line
# starting with "Grade" or the end of the text.
rx_grade = re.compile(r'^Grade\s*=\s*(?P<grade>.+)(?P<students>[\s\S]+?)(?=^Grade|\Z)', re.MULTILINE)
# Condensed (non-verbose) form of the name-table / score-table pattern.
# Adjacent raw strings are concatenated by Python into one pattern; plain
# spaces are literal here because re.VERBOSE is not used.
rx_student_score = re.compile(
    r'^Student number, Name[\n\r]'
    r'(?P<student_names>(?:^\d+.+[\n\r])+)'
    r'\s*'
    r'^Student number, Score[\n\r]'
    r'(?P<student_scores>(?:^\d+.+[\n\r])+)',
    re.MULTILINE,
)
This yields
School Grade Student number Name Score
0 Riverdale High 1 0 Phoebe 3
1 Riverdale High 1 1 Rachel 7
2 Riverdale High 2 0 Angela 6
3 Riverdale High 2 1 Tristan 3
4 Riverdale High 2 2 Aurora 9
5 Hogwarts 1 0 Ginny 8
6 Hogwarts 1 1 Luna 7
7 Hogwarts 2 0 Harry 5
8 Hogwarts 2 1 Hermione 10
9 Hogwarts 3 0 Fred 0
10 Hogwarts 3 1 George 0
As for timing, this is the result of running it ten thousand times:
import timeit
# Time 10**4 calls of `makedf` and print the total elapsed seconds.
# NOTE(review): `makedf` is not defined in this snippet; presumably it wraps
# the DataFrame-building code above in a function - confirm.
runs = 10 ** 4
elapsed_seconds = timeit.timeit(makedf, number=runs)
print(elapsed_seconds)
# 11.918397722000009 s