Supplementary Materials for ‘The speakers of minority languages are more multilingual’

Supplementary Materials

Table 1

Size of language population in Dagestan according to the 1926 census

# Render the exported lect / 1926 census file as a DT table.
DT::datatable(read_csv("lect_census_1926_present_in_study.csv"))

Appendix 1

This table contains the list of the villages used in this paper with information about the lect spoken in a village, the branch or language family to which they belong, their glottocode, and the number of L1 speakers.

# Keep only the village-level columns of the raw data, collapse repeated
# combinations to a single row, and show the result as a DT table.
village_table <- read_csv("data.csv") %>%
  select(residence.en, lect, glottocode, branch, census_1926) %>%
  distinct()
DT::datatable(village_table)

Appendix 2

Random effects from the model:

decade

# Render the stored decade random effects as a DT table.
DT::datatable(read_csv("decade_random_effects.csv"))

residence

# Render the stored residence random effects as a DT table.
DT::datatable(read_csv("residence_random_effects.csv"))

Appendix 3

Raw data

# Render the full raw data file as a DT table.
DT::datatable(read_csv("data.csv"))

Code for graphs and statistical modeling

library(tidyverse)
# Use the black-and-white ggplot2 theme for every figure below.
theme_set(theme_bw())

# Load the raw data, rewrite the Cyrillic sex codes as "m" / "f", and drop the
# rows that have a residence but no value for `birth`.
df <- read_csv("data.csv") %>%
  mutate(sex = ifelse(sex == "м", "m", ifelse(sex == "ж", "f", sex))) %>%
  filter(!is.na(birth) | is.na(residence.en))

## For Table 1

# Distinct lect / census_1926 / present_in_multidagestan rows, sorted by
# ascending census_1926, written out as the source file for Table 1.
lect_sizes <- df %>%
  distinct(lect, census_1926, present_in_multidagestan) %>%
  arrange(census_1926)
write_csv(lect_sizes, "lect_census_1926_present_in_study.csv")

Figure 1

Correlation between the number of L2s spoken by each person in the database and the decade of birth, grouped by gender

# Seven colours, mapped to factor(sum_langs) in the beeswarm layer.
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#0072B2", "#D55E00", "#CC79A7"
)

# Rows with a recorded sex; used for both the count labels and the plot.
speakers_with_sex <- filter(df, !is.na(sex))

# Group sizes per sex and number of L2s. `birth` is overwritten with the
# constant 1964 so that every count label sits at the same position on the
# year-of-birth axis.
counts <- speakers_with_sex %>%
  mutate(birth = 1964) %>%
  count(sex, sum_langs, birth)

# Beeswarm of year of birth per number of L2s, one panel per sex, with the
# group sizes drawn as labels.
ggplot(speakers_with_sex, aes(sum_langs, birth)) +
  ggbeeswarm::geom_beeswarm(
    aes(color = factor(sum_langs)),
    size = 0.5, cex = 0.7, show.legend = FALSE
  ) +
  geom_label(data = counts, aes(sum_langs, birth, label = n)) +
  coord_flip() +
  scale_x_continuous(breaks = 0:6) +
  scale_y_continuous(breaks = seq(1900, 1960, by = 10)) +
  scale_color_manual(values = cbPalette) +
  labs(x = "number of known L2", y = "year of birth") +
  facet_wrap(~sex) +
  theme(text = element_text(size = 18))

ggsave("FIG_1.png", device = "png", width = 10, height = 5)

Figure 2

Correlation of the number of L1 speakers and the number of L2s, colored by gender (female speakers offset upwards, male speakers offset downwards), faceted by decade, and overlaid with regression lines

# Number of L2s against L1 population size, one panel per decade, coloured by
# sex, with a Poisson GLM trend line per sex.
df %>%
  filter(!is.na(sex), !is.na(decade)) %>%
  # Vertical offset for the visible points: "m" rows go 0.05 down, all other
  # rows 0.05 up.
  mutate(sum_langs_2 = sum_langs + ifelse(sex == "m", -0.05, 0.05)) %>%
  ggplot(aes(census_1926, sum_langs, color = sex, fill = sex)) +
  # Fully transparent layer at the unshifted values.
  geom_point(alpha = 0, show.legend = FALSE) +
  # Visible layer at the shifted values; x, colour and fill are inherited.
  geom_point(aes(y = sum_langs_2), alpha = 0.5, show.legend = FALSE) +
  geom_smooth(
    aes(group = sex),
    method = "glm",
    method.args = list(family = poisson),
    se = FALSE
  ) +
  facet_wrap(~decade, scales = "free_x") +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(breaks = 0:6) +
  ggthemes::scale_fill_tableau(name = NULL) +
  ggthemes::scale_colour_tableau(name = NULL) +
  labs(x = "number of speakers",
       y = "number of L2") +
  theme(legend.title = element_blank(),
        legend.position = c(1, 1),
        legend.justification = c(1.1, 1.1),
        text = element_text(size = 12))

ggsave("FIG_2.png", device = "png", width = 9, height = 7)

Figure 3

Correlation of the number of L2 and L1 speakers of the same language, faceted by decade, and overlaid with linear regression lines

# Figure 3: number of L2 speakers (n) against the 1926 census population, both
# on log10 axes, one free-scaled panel per decade with a dashed linear fit.
# Each panel title is the decade followed by Kendall's tau computed within
# that decade.
read_csv("for_fig_3.csv") %>%
  group_by(decade) %>%
  # The method name is spelled out in full ("kendall") rather than relying on
  # partial matching, and the helper column no longer shadows stats::cor().
  mutate(tau = paste("τ =",
                     round(cor(n, census_1926, method = "kendall"), 3))) %>%
  ungroup() %>%
  mutate(decade = str_c(decade, ", ", tau)) %>%
  ggplot(aes(n, census_1926, label = lang_2_K)) +
  geom_smooth(method = "lm", se = FALSE, linetype = 2) +
  geom_point() +
  facet_wrap(~decade, scales = "free") +
  labs(x = "number of L2 speakers (log scale)",
       y = "population according to the 1926 census (log scale)") +
  scale_y_log10(labels = scales::comma) +
  scale_x_log10(labels = scales::comma) +
  theme(text = element_text(size = 18))

ggsave("FIG_3.png", width = 12, height = 7)

library(lme4)
library(lmerTest)
# Regression data: rows with a known sum_langs, plus two rescaled predictors.
# census_1926_modified is the census size in units of 10,000; decade2 is
# decade / 10 - 190, so a decade value of 1900 becomes 0.
df_reg <- df %>%
  filter(!is.na(sum_langs)) %>%
  mutate(census_1926_modified = census_1926 / 10000,
         decade2 = decade / 10 - 190)

# Poisson mixed model: full three-way interaction of sex, census size and
# decade, with a random intercept per residence.
fit <- glmer(
  sum_langs ~ sex * census_1926_modified * decade2 + (1 | residence.en),
  data = df_reg,
  family = "poisson"
)

# Dispersion check for the fitted model.
performance::check_overdispersion(fit)

## # Overdispersion test
## 
##        dispersion ratio =    0.438
##   Pearson's Chi-Squared = 1762.084
##                 p-value =        1

Table 2

# Print the summary of the fitted mixed model; this output is Table 2.
summary(fit)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: sum_langs ~ sex * census_1926_modified * decade2 + (1 | residence.en)
##    Data: df_reg
## 
##      AIC      BIC   logLik deviance df.resid 
##   8450.7   8507.4  -4216.3   8432.7     4023 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4541 -0.4448 -0.1142  0.3527  3.9149 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  residence.en (Intercept) 0.5625   0.75    
## Number of obs: 4032, groups:  residence.en, 54
## 
## Fixed effects:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -0.162451   0.140772  -1.154 0.248500    
## sexm                               0.391986   0.071094   5.514 3.51e-08 ***
## census_1926_modified              -0.107254   0.030774  -3.485 0.000492 ***
## decade2                           -0.020612   0.017638  -1.169 0.242570    
## sexm:census_1926_modified          0.034856   0.017978   1.939 0.052518 .  
## sexm:decade2                      -0.057721   0.023328  -2.474 0.013348 *  
## census_1926_modified:decade2      -0.004729   0.004948  -0.956 0.339210    
## sexm:census_1926_modified:decade2 -0.003119   0.006114  -0.510 0.609903    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) sexm   cn_1926_ decad2 sx:_1926_ sxm:d2 c_1926_:
## sexm        -0.292                                                 
## cnss_1926_m -0.565  0.164                                          
## decade2     -0.333  0.652  0.189                                   
## sxm:c_1926_  0.144 -0.463 -0.388   -0.319                          
## sexm:decad2  0.247 -0.858 -0.141   -0.751  0.397                   
## cns_1926_:2  0.146 -0.288 -0.399   -0.450  0.684     0.339         
## sx:_1926_:2 -0.120  0.383  0.326    0.363 -0.829    -0.458 -0.808

# Save the per-residence random intercepts of `fit`.
# The only grouping factor in the model is residence.en, so the label column
# is named `residence` (it used to be written out as `decade`).
# The intercepts are extracted as a plain vector: handing the one-column ranef
# data frame to data.frame() kept its own column name, so the file ended up
# with `X.Intercept.` instead of `intercept`.
residence_effects <- ranef(fit)$residence.en
tibble(residence = rownames(residence_effects),
       intercept = residence_effects[["(Intercept)"]]) %>%
  write_csv("residence_random_effects.csv")

Figure 4

Estimated values of the number of L2s predicted for female and male speakers depending on the population size of the L1

# Distinct lect / sex / census_1926 / sum_langs combinations, with sex spelled
# out: "f" becomes "female", every other non-missing value becomes "male".
for_rug <- df %>%
  mutate(sex = ifelse(sex == "f", "female", "male")) %>%
  distinct(lect, sex, census_1926, sum_langs)

library(ggeffects)
# Figure 4: model predictions over census_1926_modified, one curve per sex.
# The predictor is the census size divided by 10000, so the tick labels show
# the break values multiplied back by 10000.
plot(ggpredict(fit, terms = c("census_1926_modified", "sex"))) +
  scale_x_continuous(
    breaks = seq(0, 12.5, by = 2.5),
    labels = scales::comma(seq(0, 125000, by = 25000), accuracy = 1)
  ) +
  ggthemes::scale_fill_tableau(name = NULL) +
  ggthemes::scale_colour_tableau(name = NULL) +
  labs(x = "number of L1 speakers", y = "number of L2", title = "") +
  theme(legend.title = element_blank(),
        legend.position = c(1, 1),
        legend.justification = c(1.1, 1.1),
        text = element_text(size = 18))

ggsave("FIG_4.png", device = "png", width = 9, height = 7)

Figure 5

Estimated values of the number of L2s predicted for female and male speakers depending on the size of the L1 community with extrapolation to larger language communities

# Double-headed closed arrow shared by the two range markers below.
double_arrow <- arrow(length = unit(.3, 'cm'), type = "closed", ends = "both")

# Figure 5: the same predictions as Figure 4, evaluated for
# census_1926_modified from 0 to 50. The rug marks the census sizes present in
# `for_rug`; the shaded bands and arrows label the x ranges annotated as
# interpolation and extrapolation.
plot(ggpredict(fit, terms = c("census_1926_modified [0:50]", "sex"))) +
  scale_x_continuous(
    breaks = seq(0, 50, by = 10),
    labels = scales::comma(seq(0, 500000, by = 100000), accuracy = 1)
  ) +
  ggthemes::scale_fill_tableau(name = NULL) +
  ggthemes::scale_colour_tableau(name = NULL) +
  labs(x = "number of L1 speakers", y = "number of L2", title = "") +
  # Annotation layers are added in the original drawing order.
  annotate(geom = "rug", x = for_rug$census_1926 / 10000) +
  annotate(geom = "text", x = 31.5, y = 1.25,
           label = "model extrapolation", size = 4) +
  annotate(geom = "segment", x = 14, xend = 49, y = 1.15, yend = 1.15,
           arrow = double_arrow) +
  annotate(geom = "text", x = 7, y = 0.15,
           label = "model\n interpolation", size = 4) +
  annotate(geom = "segment", x = 2.5, xend = 11.5, y = 0.03, yend = 0.03,
           arrow = double_arrow) +
  annotate(geom = "rect", xmin = 2, xmax = 12,
           ymin = -Inf, ymax = Inf, alpha = 0.06) +
  annotate(geom = "rect", xmin = 13, xmax = 50,
           ymin = -Inf, ymax = Inf, alpha = 0.06) +
  theme(legend.title = element_blank(),
        legend.position = c(1, 1),
        legend.justification = c(1.5, 0.9),
        text = element_text(size = 18))

ggsave("FIG_5.png", device = "png", width = 9, height = 7)

Supplementary Materials for ‘The speakers of minority languages are more multilingual’

N. Dobrushina, G. Moroz