Skip to content Skip to sidebar Skip to footer

Data Manipulation, Kind Of Downsampling

I have a large csv file, example of the data below. I will use an example of eight teams to illustrate. home_team away_team home_score away_score year belgium

Solution 1:

Here is a solution using the tidyverse (dplyr and tidyr), in particular the pivot functions from tidyr...

library(tidyverse)df%>%pivot_longer(cols=-year,#splits non-year columns into home/away and type columnsnames_to=c("homeaway","type"),names_sep="_",values_to="value",values_ptypes=list(value=character()))%>%select(-homeaway)%>%#remove home/awaypivot_wider(names_from="type",#restore team and score columns (as list columns)values_from="value")%>%unnest(cols=c(team,score))%>%#unnest the list columns to year, team, scoregroup_by(year,team)%>%summarise(total_goals=sum(as.numeric(score)))# A tibble: 14 x 3# Groups:   year [2]yearteamtotal_goals<int><chr><dbl>11990 belgium421990 brazil331990 france241990 italy151990 mexico161990 sweden371990 uruguay181991 belgium291991 brazil2101991 chile3111991 england1121991 france3131991 italy1141991 switzerland2

Solution 2:

Here is yet another solution in R.

#Packages needed
library(dplyr)
library(magrittr)
library(tidyr)

#Your data
home_team <- c("belgium", "brazil", "italy", "sweden",
               "france", "brazil", "italy", "chile")
away_team <- c("france", "uruguay", "belgium", "mexico",
               "chile", "england", "belgium", "switzerland")
home_score <- c(2,3,1,3,
                3,2,1,2)
away_score <- c(2,1,2,1,
                1,1,2,2)
year <- c(1990, 1990, 1990, 1990,
          1991, 1991, 1991, 1991)

df <- data.frame(home_team, away_team, home_score, away_score, year, stringsAsFactors = FALSE)

df#   home_team   away_team home_score away_score year# 1   belgium      france          2          2 1990# 2    brazil     uruguay          3          1 1990# 3     italy     belgium          1          2 1990# 4    sweden      mexico          3          1 1990# 5    france       chile          3          1 1991# 6    brazil     england          2          1 1991# 7     italy     belgium          1          2 1991# 8     chile switzerland          2          2 1991#Column names for the new data.frames
my_colnames <- c("team", "score", "year")

#Using select() to create separate home and away datasets
df_home <- df %>% select(matches("home|year")) %>% setNames(my_colnames) %>% mutate(game_where = "home")
df_away <- df %>% select(matches("away|year")) %>% setNames(my_colnames) %>% mutate(game_where = "away")

#rbind()'ing both data.frames#Grouping the rows together first by the team and then by the year#Summing up the scores for the aforementioned groupings#Sorting the newly produced data.frame by year
df_1 <- rbind(df_home, df_away) %>% group_by(team, year) %>% tally(score) %>% arrange(year)

df_1 

 #   team         year     n#   <chr>       <dbl> <dbl># 1 belgium      1990     4# 2 brazil       1990     3# 3 france       1990     2# 4 italy        1990     1# 5 mexico       1990     1# 6 sweden       1990     3# 7 uruguay      1990     1# 8 belgium      1991     2# 9 brazil       1991     2#10 chile        1991     3#11 england      1991     1#12 france       1991     3#13 italy        1991     1#14 switzerland  1991     2

Solution 3:

You can try:

library(dplyr)

setNames(rbind(df[,c(1,3,5)], 
               setNames(df[,c(2,4,5)],names(df[,c(1,3,5)]))),c("Country","Goals","Year"))%>%
  group_by(Year, Country)%>% 
  summarize(Total =sum(Goals))#> # A tibble: 14 x 3#> # Groups:   Year [2]#>     Year Country     Total#>    <int> <chr>       <int>#>  1  1990 belgium         4#>  2  1990 brazil          3#>  3  1990 france          2#>  4  1990 italy           1#>  5  1990 mexico          1#>  6  1990 sweden          3#>  7  1990 uruguay         1#>  8  1991 belgium         2#>  9  1991 brazil          2#> 10  1991 chile           3#> 11  1991 england         1#> 12  1991 france          3#> 13  1991 italy           1#> 14  1991 switzerland     2

Solution 4:

Adding a solution that uses dplyr only.

library(dplyr)

 bind_rows(
   select(df, team = home_team, score = home_score, year),
   select(df, team = away_team, score = away_score, year)
 ) %>% 
   group_by(team, year) %>% 
   summarise(total_scores = sum(score))

Post a Comment for "Data Manipulation, Kind Of Downsampling"