|
|
import re |
|
|
import pandas as pd |
|
|
|
|
|
import dateutil.parser |
|
|
|
|
|
# Matches the shortest leading "<user>: " prefix of a chat entry.
_USER_PREFIX_RE = re.compile(r'([\w\W]+?):\s')


def _split_user_message(text):
    """Split one raw entry into ``(user, message)``.

    Entries without a ``"<user>: "`` prefix are attributed to the
    pseudo-user ``'group_notification'`` and returned unchanged.
    """
    # maxsplit=1: only the first ": " separates user from message, so any
    # later colons stay part of the message text.
    parts = _USER_PREFIX_RE.split(text, maxsplit=1)
    if parts[1:]:
        return parts[1], parts[2]
    return 'group_notification', parts[0]


def _hour_period(hour):
    """Return the one-hour bucket label for *hour* (e.g. ``'9-10'``)."""
    if hour == 23:
        return "23-00"
    if hour == 0:
        return "00-1"
    return "{}-{}".format(hour, hour + 1)


def preprocess(data):
    """Parse raw chat text into a DataFrame with user, message and date parts.

    Each line of *data* is expected to look like ``<date>,<rest>``: the text
    before the first comma is parsed with ``dateutil.parser.parse`` and
    everything after it is kept as the raw entry.  Lines without a comma, or
    whose leading part is not a parseable date, are skipped (best effort).

    Args:
        data: The whole chat export as a single string.

    Returns:
        A ``pandas.DataFrame`` with one row per accepted line and columns
        ``user_message`` (raw text after the date), ``date``, ``user``,
        ``message``, ``only_date``, ``year``, ``month_num``, ``month``,
        ``day``, ``day_name``, ``hour``, ``minute`` and ``period``.
        ``user_message`` and ``date`` are kept so callers of the earlier
        two-column result keep working.
    """
    raw_entries = []
    dates = []
    for line in data.splitlines():
        date_text, comma, remainder = line.partition(',')
        if not comma:
            # No comma means there is no message part; skip the line.
            continue
        try:
            parsed = dateutil.parser.parse(date_text)
        except (ValueError, OverflowError, IndexError):
            # Not a dated line: skipped on purpose rather than failing the
            # whole import.
            continue
        dates.append(parsed)
        raw_entries.append(remainder)

    df = pd.DataFrame({'user_message': raw_entries, 'date': dates})
    # With no accepted lines the column would not be datetime-typed and the
    # `.dt` accessor below would raise; this is a no-op for datetime data.
    df['date'] = pd.to_datetime(df['date'])

    # Separate the author from the message text.
    pairs = [_split_user_message(text) for text in df['user_message']]
    df['user'] = [user for user, _ in pairs]
    df['message'] = [message for _, message in pairs]

    # Calendar features derived from the timestamp.
    when = df['date'].dt
    df['only_date'] = when.date
    df['year'] = when.year
    df['month_num'] = when.month
    df['month'] = when.month_name()
    df['day'] = when.day
    df['day_name'] = when.day_name()
    df['hour'] = when.hour
    df['minute'] = when.minute

    # Hour bucket label, e.g. '9-10', with '23-00' and '00-1' at the edges.
    df['period'] = [_hour_period(int(hour)) for hour in df['hour']]

    return df