Analyzing Telegram chats with R and Matlab

I use Telegram a lot for chatting. Since it is possible to export the chat history with everyone you have been chatting with, I wanted to create a word cloud featuring the most common words from my conversations with each chat partner.

I use several tools for this. There is telegram-history-dump, which itself uses telegram-cli, for exporting the chat history. Then I use Matlab for some postprocessing (or preprocessing, depending on how you view it) to bring the dumped output into a (for me) manageable form. Lastly, I use R to generate a word cloud like the one at the top of this post.

With the tutorial from telegram-history-dump it is relatively straightforward to dump all chat histories to your local disk in .jsonl format (one JSON object per line). I therefore assume that you have the data in that form. In the following parts I explain the Matlab and R steps that I performed.
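To give an idea of the data, the fields that the scripts below rely on look roughly like this (field names inferred from the Matlab code in this post; the details of your dump may differ, and I have shortened the record):

{"out": true, "date": 1467027480, "text": "hello", "from": {"when": "2016-06-27 13:38:00", ...}, "to": {"when": "2016-06-27 13:38:00", ...}}

Here out says whether I sent the message, date is a Unix timestamp in seconds, text is missing for media messages, and from.when/to.when carry a 'yyyy-mm-dd hh:mm:ss' date string.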

I am better with Matlab, so I started with some basic analysis. I read the data and analyzed how many messages per day/week were sent, as well as the distribution of the message lengths. The chat partner's name is replaced with 'other person'. All source code is available at the end of this post. The three figures were generated with analyze.m.

I am not yet at a point where I can do all of this in R. So I use Matlab again to read the .jsonl files and write all strings into a single file that I can then read from R. I think the only reason to use R in the first place is that I want to learn it, and the word cloud idea did not work out for me in Matlab. Have a look at towords.m for that process. It generates a file called words.txt, which is then used by R.

The R code is mostly copy-pasted from a variety of websites. It uses words.txt to generate a word cloud like the one in the preview. The code is pasted below, too.
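In case the required R packages are not installed yet, they can be fetched from CRAN first:

install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer"))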




analyze.m:

function analyze

name = 'other_person';
s = jsonToStruct(strcat(name, '.jsonl')); % parse the .jsonl dump into a cell array of structs

out = zeros(length(s), 1);       % 1 if I sent the message, 0 otherwise
timestamp = zeros(length(s), 1); % Unix timestamps in seconds
media = zeros(length(s), 1);     % 1 for media messages without text
Text = cell(1, length(s));       % message texts


for i = 1:length(s)
    out(i) = s{i}.out;
    timestamp(i) = s{i}.date;
    if isfield(s{i}, 'text')
        Text{i} = s{i}.text;
    else
        Text{i} = ''; % was media
        media(i) = 1;
    end
end

% remove media messages, keep only real text
out = out(not(media));
timestamp = timestamp(not(media));
Text = Text(not(media))';

textout = Text(out == 1);
textin = Text(out == 0);


% extract the time of day of each message from its date string
timestampSec = zeros(length(s), 1);
timestampMin = zeros(length(s), 1);
timestampHour = zeros(length(s), 1);

for i = 1:length(s)
    if s{i}.out
        d = s{i}.from.when;
    else
        d = s{i}.to.when;
    end
    d = split(d, ' ');
    d = split(d{2}, ':'); % keep the hh:mm:ss part of 'yyyy-mm-dd hh:mm:ss'
    timestampSec(i) = str2double(d{3});
    timestampMin(i) = str2double(d{2});
    timestampHour(i) = str2double(d{1});
end


% message lengths in characters
ntextout = cellfun(@length, textout);
ntextin = cellfun(@length, textin);
% normalize the timestamps: the zero point is the midnight before the
% first message, the unit is days (13.633 h = 13 h 38 min is the time of
% day at which the first message in my data was sent; adapt this number
% to your own chat)
timestamp = timestamp-min(timestamp); % seconds since the first message
timestamp = timestamp/(60*60);        % to hours
timestamp = timestamp-13.633;         % shift the zero point to midnight
timestamp = timestamp/24;             % to days
ts = timestamp;

maxdays = 1:ceil(max(ts));
% count messages per day
msgperday = zeros(1, length(maxdays));
msgperdayout = zeros(1, length(maxdays));
for i = 1:length(maxdays)
    t = (ts > i-1 & ts <= i);
    msgperday(i) = sum(t);
    msgperdayout(i) = sum(t.*out);
end
% count messages per week
maxweeks = 1:ceil(max(maxdays)/7);
msgperweek = zeros(1, length(maxweeks));
msgperweekout = zeros(1, length(maxweeks));
for i = 1:length(maxweeks)
    t = (ts > 7*(i-1) & ts <= 7*i);
    msgperweek(i) = sum(t);
    msgperweekout(i) = sum(t.*out);
end


% plot messages per day/week, incoming vs. outgoing

close all
figure(1)
subplot(2, 2, 1:2)
bar(msgperday)
hold on
bar(msgperdayout)
colormap([1, 0, 0; ... %// red
    0, 0, 1; ... %// blue
    ])
legend('other person', 'Thomas')
xlabel('#days since first contact')
ylabel('#messages per day')
title('message history')
subplot(2, 2, 3:4)
bar(msgperweek)
hold on
bar(msgperweekout)
colormap([1, 0, 0; ... %// red
    0, 0, 1; ... %// blue
    ])
legend('other person', 'Thomas')
xlabel('#weeks since first contact')
ylabel('#messages per week')
% title('Telegram communication')
figure
subplot(1, 2, 1)
pie([sum(not(out)), sum(out)])
legend('other person', 'Thomas')
colormap([0, 0, 1; ...
    1, 0, 0; ...
    ])
title('#messages')

subplot(1, 2, 2)
pie([sum(ntextin), sum(ntextout)])
legend('other person', 'Thomas')
colormap([0, 0, 1; ...
    1, 0, 0; ...
    ])
title('#chars')


figure
[n, x] = histcounts(ntextin, 300);
barh(x(1:end-1), -n)
m = max(n);
[n, x] = histcounts(ntextout, x);
m = max(m, max(n));
hold on
barh(x(1:end-1), n)
legend('other person', 'Thomas')
ylabel('#chars per message')
xlabel('#messages with that number of chars');
axis([-1.1*m, 1.1*m, 0, 200]);
title('message length distribution, cutoff at y=200')

save; % save the whole workspace to matlab.mat, used by towords.m

end
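analyze.m uses a helper jsonToStruct that is not shown above and is not a built-in Matlab function. A minimal sketch of what it needs to do, assuming a Matlab version with jsondecode (R2016b or newer), could look like this:

function s = jsonToStruct(fname)
% read a .jsonl dump (one JSON object per line) into a cell array of structs
fid = fopen(fname, 'r');
s = {};
line = fgetl(fid);
while ischar(line)
    if ~isempty(line)
        s{end+1} = jsondecode(line); %#ok<AGROW>
    end
    line = fgetl(fid);
end
fclose(fid);
end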

towords.m:

function towords
% load the workspace saved by analyze.m; only the outgoing messages are needed
data = load('matlab');
tthomas = data.textout;


nt = length(tthomas);

% interleave the messages with spaces so that they do not stick together
tthomaswhite = cell(1, 2*nt);
for k = 1:nt
    tthomaswhite{2*k-1} = tthomas{k};
    tthomaswhite{2*k} = ' ';
end

T = strcat(tthomaswhite{:});
T = strrep(T, '*', '* '); % put a space after '*' so it does not glue words together

% replace sentence punctuation with spaces and normalize the case
T = strrep(T, '?', ' ');
T = strrep(T, '!', ' ');
T = strrep(T, '.', ' ');
T = lower(T);

W = split(T, ' ');
W = W(~cellfun('isempty', W)); % drop empty strings caused by repeated spaces

% write one word per line; this is the input for the R script
fid = fopen('words.txt', 'w');
fprintf(fid, '%s\n', W{:});
fclose(fid);


end
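As a side note, the interleaving loop in towords.m is a hand-rolled strjoin; since strjoin is part of Matlab from R2013a on, the concatenation could also be written in one line:

T = strjoin(tthomas, ' '); % join all messages with single spaces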

The R script:

library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)

text <- readLines('words.txt') # the file generated by towords.m
docs <- Corpus(VectorSource(text))
inspect(docs)

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")

# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common German stopwords
docs <- tm_map(docs, removeWords, stopwords("german"))
# Remove your own stopwords, specified as a character vector
docs <- tm_map(docs, removeWords, c("dass", "mal", "schon", "gerade", "just"))
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming
#docs <- tm_map(docs, stemDocument)



dtm <- TermDocumentMatrix(docs)
dtm <- removeSparseTerms(dtm,sparse = 0.9999)

m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 20)

set.seed(12334)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.45, 
          colors=brewer.pal(8, "Dark2"))

# some exploration: frequent terms and associations (the term "freedom" is
# a leftover from the tutorial this is based on; replace it with a word
# from your own chat)
findFreqTerms(dtm, lowfreq = 4)
findAssocs(dtm, terms = "freedom", corlimit = 0.3)
head(d, 10)
barplot(d[1:20,]$freq, las = 2, names.arg = d[1:20,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies")
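If you want the word cloud as an image file instead of a plot window, you can wrap the call in a graphics device, for example:

png("wordcloud.png", width = 800, height = 800) # file name is just an example
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.45,
          colors = brewer.pal(8, "Dark2"))
dev.off()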


