Data Manipulation, Kind Of Downsampling
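I have a large CSV file; an example of the data is below. I will use an example of eight teams to illustrate. The columns are `home_team`, `away_team`, `home_score`, `away_score` and `year`; the first row starts with `belgium` (the complete example data is reproduced in Solution 2).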
Solution 1:
Here is a solution using the tidyverse (`dplyr` and `tidyr`), in particular the pivot functions from `tidyr`:
library(tidyverse)

# Total goals per team per year, counting both home and away appearances.
# Expects `df` to have the columns home_team, away_team, home_score,
# away_score and year.
df %>%
  # Split every non-year column name at "_" into a home/away part and a
  # type part ("team" or "score"). Team names are character and scores are
  # numeric, so each column is converted to character before stacking;
  # `values_ptypes` only asserts a type in current tidyr and would fail here.
  pivot_longer(
    cols = -year,
    names_to = c("homeaway", "type"),
    names_sep = "_",
    values_to = "value",
    values_transform = list(value = as.character)
  ) %>%
  # Home vs. away no longer matters once the rows are stacked.
  select(-homeaway) %>%
  # Restore the team and score columns. Each year has many values per type,
  # so they are collected into list columns explicitly via `values_fn`.
  pivot_wider(
    names_from = "type",
    values_from = "value",
    values_fn = list
  ) %>%
  # Expand the two parallel list columns back to one row per (year, team,
  # score) observation.
  unnest(cols = c(team, score)) %>%
  group_by(year, team) %>%
  # Scores were carried as character through the reshape; convert back.
  summarise(total_goals = sum(as.numeric(score)))
#> # A tibble: 14 x 3
#> # Groups:   year [2]
#>     year team        total_goals
#>    <int> <chr>             <dbl>
#>  1  1990 belgium               4
#>  2  1990 brazil                3
#>  3  1990 france                2
#>  4  1990 italy                 1
#>  5  1990 mexico                1
#>  6  1990 sweden                3
#>  7  1990 uruguay               1
#>  8  1991 belgium               2
#>  9  1991 brazil                2
#> 10  1991 chile                 3
#> 11  1991 england               1
#> 12  1991 france                3
#> 13  1991 italy                 1
#> 14  1991 switzerland           2
Solution 2:
Here is yet another solution in R.
#Packages needed
library(dplyr)
library(magrittr)
library(tidyr)
# Example data: eight matches spread over two seasons ----
home_team <- c(
  "belgium", "brazil", "italy", "sweden",
  "france", "brazil", "italy", "chile"
)
away_team <- c(
  "france", "uruguay", "belgium", "mexico",
  "chile", "england", "belgium", "switzerland"
)
home_score <- c(2, 3, 1, 3, 3, 2, 1, 2)
away_score <- c(2, 1, 2, 1, 1, 1, 2, 2)
# Four matches in each of the two seasons.
year <- rep(c(1990, 1991), each = 4)

# Team names stay plain character vectors (no factor conversion).
df <- data.frame(
  home_team = home_team,
  away_team = away_team,
  home_score = home_score,
  away_score = away_score,
  year = year,
  stringsAsFactors = FALSE
)
df
#   home_team   away_team home_score away_score year
# 1   belgium      france          2          2 1990
# 2    brazil     uruguay          3          1 1990
# 3     italy     belgium          1          2 1990
# 4    sweden      mexico          3          1 1990
# 5    france       chile          3          1 1991
# 6    brazil     england          2          1 1991
# 7     italy     belgium          1          2 1991
# 8     chile switzerland          2          2 1991

# Column names for the new data.frames
# Shared names so the home and away halves line up when they are stacked.
my_colnames <- c("team", "score", "year")

# Build one data set per side of the fixture: keep that side's team and score
# columns plus year, rename them to the shared names, and record the side.
df_home <- df %>%
  select(matches("home|year")) %>%
  setNames(my_colnames) %>%
  mutate(game_where = "home")

df_away <- df %>%
  select(matches("away|year")) %>%
  setNames(my_colnames) %>%
  mutate(game_where = "away")

# Stack both halves, group by team and then year, sum the scores within each
# group (the total lands in column `n`), and finally sort the result by year.
df_1 <- rbind(df_home, df_away) %>%
  group_by(team, year) %>%
  tally(wt = score) %>%
  arrange(year)

df_1
#    team         year     n
#    <chr>       <dbl> <dbl>
#  1 belgium      1990     4
#  2 brazil       1990     3
#  3 france       1990     2
#  4 italy        1990     1
#  5 mexico       1990     1
#  6 sweden       1990     3
#  7 uruguay      1990     1
#  8 belgium      1991     2
#  9 brazil       1991     2
# 10 chile        1991     3
# 11 england      1991     1
# 12 france       1991     3
# 13 italy        1991     1
# 14 switzerland  1991     2
Solution 3:
You can try:
library(dplyr)
# Stack the home columns (1, 3, 5) on top of the away columns (2, 4, 5) and
# total the goals per country within each year.
df[, c(2, 4, 5)] %>%
  # Give the away half the home half's column names so rbind() can match them.
  setNames(names(df)[c(1, 3, 5)]) %>%
  # Home rows first, away rows second.
  rbind(df[, c(1, 3, 5)], .) %>%
  setNames(c("Country", "Goals", "Year")) %>%
  group_by(Year, Country) %>%
  summarize(Total = sum(Goals))
#> # A tibble: 14 x 3
#> # Groups:   Year [2]
#>     Year Country     Total
#>    <int> <chr>       <int>
#>  1  1990 belgium         4
#>  2  1990 brazil          3
#>  3  1990 france          2
#>  4  1990 italy           1
#>  5  1990 mexico          1
#>  6  1990 sweden          3
#>  7  1990 uruguay         1
#>  8  1991 belgium         2
#>  9  1991 brazil          2
#> 10  1991 chile           3
#> 11  1991 england         1
#> 12  1991 france          3
#> 13  1991 italy           1
#> 14  1991 switzerland     2
Solution 4:
Adding a solution that uses `dplyr` only.
library(dplyr)
# Reduce each side of the fixture to (team, score, year), stack the home rows
# above the away rows, then total the scores for every team/year pair.
df %>%
  select(team = home_team, score = home_score, year) %>%
  bind_rows(
    df %>% select(team = away_team, score = away_score, year)
  ) %>%
  group_by(team, year) %>%
  summarise(total_scores = sum(score))
Post a Comment for "Data Manipulation, Kind Of Downsampling"