% pubs2018.bib
@article{Ali2018,
abstract = {Learning representation from audio data has shown advantages over the handcrafted features such as mel-frequency cepstral coefficients (MFCCs) in many audio applications. In most of the representation learning approaches, the connectionist systems have been used to learn and extract latent features from the fixed length data. In this paper, we propose an approach to combine the learned features and the MFCC features for speaker recognition task, which can be applied to audio scripts of different lengths. In particular, we study the use of features from different levels of deep belief network for quantizing the audio data into vectors of audio word counts. These vectors represent the audio scripts of different lengths that make them easier to train a classifier. We show in the experiment that the audio word count vectors generated from mixture of DBN features at different layers give better performance than the MFCC features. We also can achieve further improvement by combining the audio word count vector and the MFCC features.},
author = {H Ali and S N Tran and E Benetos and A S d'Avila Garcez},
doi = {10.1007/s00521-016-2501-7},
issn = {0941-0643},
issue = {6},
journal = {Neural Computing and Applications},
month = {3},
pages = {13-19},
publisher = {Springer Verlag (Germany)},
title = {Speaker recognition with hybrid features from a deep belief network},
volume = {29},
url = {http://link.springer.com/article/10.1007/s00521-016-2501-7},
year = {2018},
}
@inproceedings{Allik2018,
abstract = {© 2018 IW3C2 (International World Wide Web Conference Committee), published under Creative Commons CC BY 4.0 License. MusicLynx is a web application for music discovery that enables users to explore an artist similarity graph constructed by linking together various open public data sources. It provides a multifaceted browsing platform that strives for an alternative, graph-based representation of artist connections to the grid-like conventions of traditional recommendation systems. Bipartite graph filtering of the Linked Data cloud, content-based music information retrieval, machine learning on crowd-sourced information and Semantic Web technologies are combined to analyze existing and create new categories of music artists through which they are connected. The categories can uncover similarities between artists who otherwise may not be immediately associated: for example, they may share ethnic background or nationality, common musical style or be signed to the same record label, come from the same geographic origin, share a fate or an affliction, or have made similar lifestyle choices. They may also prefer similar musical keys, instrumentation, rhythmic attributes, or even moods their music evokes. This demonstration is primarily meant to showcase the graph-based artist discovery interface of MusicLynx: how artists are connected through various categories, how the different graph filtering methods affect the topology and geometry of linked artists graphs, and ways in which users can connect to external services for additional content and information about objects of their interest.},
author = {A Allik and F Thalmann and M Sandler},
doi = {10.1145/3184558.3186970},
isbn = {9781450356404},
journal = {The Web Conference 2018 - Companion of the World Wide Web Conference, WWW 2018},
month = {4},
pages = {167-170},
title = {MusicLynx: Exploring Music Through Artist Similarity Graphs},
year = {2018},
}
@inproceedings{ARMITAGE2018,
author = {J D K ARMITAGE and A MCPHERSON},
month = {6},
journal = {Proc. of the New Interfaces for Musical Expression (NIME)},
title = {Crafting Digital Musical Instruments: An Exploratory Workshop Study},
year = {2018},
}
@article{Baker2018,
abstract = {© 2018 Intellect Ltd Project Review. Wearable technologies are a nascent market, growing exponentially and moving into our everyday lives more and more. They are exciting in their capacity to break down barriers between artists and designers and digital technology companies. Technology is becoming more efficient, accurate, and personalized. Hardware is becoming smaller, less visible, more connected and the collected data more seamless and ubiquitous. At the core of the wearable technology concerns is the amount of data that electronics companies are allowed to collect, in particular of their users’ personal data. Numerous technology companies and start-ups are working to make the next wearable device or application for body data tracking. This article provides a first critical analysis of the selection and monitoring processes used in these Open Calls and reports on initial work of the WEAR Sustain network, which for eighteen months has operated as a Pan-European catalyst for 46 projects in wearable technology design and development to the point of market and investment readiness, and discussing next steps for its Sustainability Toolkit for lasting impact.},
author = {C Baker and H Ranaivoson and B Greinke and N Bryan-Kinns},
doi = {10.1386/vcr.8.1.91_1},
issue = {1},
journal = {Virtual Creativity},
month = {6},
pages = {91-105},
title = {WEAR: Wearable technologists engage with artists for responsible innovation: Processes and progress},
volume = {8},
year = {2018},
}
@inproceedings{BEAR2018,
abstract = {We present a new extensible and divisible taxonomy for open set sound scene analysis. This new model allows complex scene analysis with tangible descriptors and perception labels. Its novel structure is a cluster graph such that each cluster (or subset) can stand alone for targeted analyses such as office sound event detection, whilst maintaining integrity over the whole graph (superset) of labels. The key design benefit is its extensibility as new labels are needed during new data capture. Furthermore, datasets which use the same taxonomy are easily augmented, saving future data collection effort. We balance the details needed for complex scene analysis with avoiding 'the taxonomy of everything' with our framework to ensure no duplicity in the superset of labels and demonstrate this with DCASE challenge classifications.},
author = {H BEAR and E BENETOS},
journal = {http://dcase.community/workshop2018/},
month = {11},
title = {An extensible cluster-graph taxonomy for open set sound scene analysis},
url = {http://dcase.community/workshop2018/},
year = {2018},
}
@inproceedings{Bechhofer2018,
author = {S Bechhofer and G Fazekas and K Page},
isbn = {9781450364959},
journal = {ACM International Conference Proceeding Series},
month = {10},
title = {Preface: ACM International Conference Proceeding Series},
year = {2018},
}
@book_section{BENETOS2018,
author = {E BENETOS and D STOWELL and M PLUMBLEY},
doi = {10.1007/978-3-319-63450-0},
edition = {1},
editor = {T Virtanen and M PLUMBLEY and D Ellis},
isbn = {978-3-319-63449-4},
issue = {8},
journal = {Computational Analysis of Sound Scenes and Events},
month = {1},
pages = {215-242},
publisher = {Springer International Publishing},
title = {Approaches to complex sound scene analysis},
url = {http://www.springer.com/gb/book/9783319634494},
year = {2018},
}
@article{Bengler2018,
author = {B Bengler and F Martin and N Bryan-Kinns},
doi = {10.1145/3183349},
issn = {1072-5520},
issue = {2},
journal = {Interactions},
month = {2},
pages = {12-13},
title = {Collidoscope},
volume = {25},
year = {2018},
}
@inproceedings{BIN2018,
abstract = {This paper presents a study examining the effects of disfluent design on audience perception of digital musical instrument (DMI) performance. Disfluency, defined as a barrier to effortless cognitive processing, has been shown to generate better results in some contexts as it engages higher levels of cognition. We were motivated to determine if disfluent design in a DMI would result in a risk state that audiences would be able to perceive, and if this would have any effect on their evaluation of the performance. A DMI was produced that incorporated a disfluent characteristic: It would turn itself off if not constantly moved. Six physically identical instruments were produced, each in one of three versions: Control (no disfluent characteristics), mild disfluency (turned itself off slowly), and heightened disfluency (turned itself off more quickly). 6 percussionists each performed on one instrument for a live audience (N=31), and data was collected in the form of real-time feedback (via a mobile phone app), and post-hoc surveys. Though there was little difference in ratings of enjoyment between the versions of the instrument, the real-time and qualitative data suggest that disfluent behaviour in a DMI may be a way for audiences to perceive and appreciate performer skill.},
author = {S M A BIN and N BRYAN-KINNS and A P MCPHERSON},
month = {6},
journal = {Proc. of the New Interfaces for Musical Expression (NIME)},
title = {Risky business: Disfluency as a design strategy},
year = {2018},
}
@inproceedings{Bromham2018,
abstract = {© 2018 KASHYAP. Dynamic range compressors (DRC) are one of the most commonly used audio effect in music production. The timing settings are particularly important for controlling the manner in which they will shape an audio signal. We present a subjective user study of DRC, where a series of different compressor attack and release setting are varied and applied to a set of 30 sec audio tracks. Participants are then asked to rate which ballistic settings are most appropriate for the style of music in their judgement and asked to select one of a series of tag words, to describe the style or setting of the song. Results show that the attack parameter influences perceived style, more than the release parameter. From the study this is seen more evidently in the case of Jazz and Rock styles than in EDM or Hip-Hop. The area of intelligent Music production systems might benefit from this study in the future as it may help to inform appropriateness for certain DRC settings in varying styles.},
author = {G Bromham and D Moffat and M Barthet and G Fazekas},
journal = {145th Audio Engineering Society International Convention, AES 2018},
month = {1},
title = {The impact of compressor ballistics on the perceived style of music},
year = {2018},
}
@inproceedings{bryan-kinns2018,
author = {N Bryan-Kinns},
doi = {10.14236/ewic/hci2018.98},
issn = {1477-9358},
journal = {Electronic Workshops in Computing (eWiC)},
month = {7},
title = {Case Study of Data Mining Mutual Engagement},
year = {2018},
}
@article{bryan-kinns2018b,
abstract = {© The Author(s) 2018. Co-creation across cultures is a fertile area for the study of design and human computer interaction. Many studies have examined what can be learnt from cultures across the world and how cultures respond to interactive technology, and yet open questions remain on how to engage people in cocreation across cultures. In this article, we reflect on a study of cross-cultural co-creation with the Kam ethnic minority group of China. We report on the kinds of collaboration and value that emerged through the co-creation of an interactive drama, and how a traditional Chinese literature composition method was used to structure the design process. We present a notation for describing cross-cultural co-creation and reflect on the careful balance that we found needed to be struck between the depth of co-creation, immersion in local culture, cultural exchange and interactivity. We report on the use of our approach to elicit serendipitous design opportunities in-situ and how our non-utilitarian approach allowed us to explore different meanings of 'interactivity' across cultures.},
author = {N Bryan-Kinns and W Wang and T Ji},
doi = {10.1093/iwc/iwy010},
issn = {0953-5438},
issue = {4},
journal = {Interacting with Computers},
month = {7},
pages = {273-292},
title = {Exploring interactivity and co-creation in rural China},
volume = {30},
year = {2018},
}
@inproceedings{bryan-kinns2018c,
author = {N Bryan-Kinns and W Wang and Y Wu},
doi = {10.14236/ewic/hci2018.214},
issn = {1477-9358},
journal = {Electronic Workshops in Computing (eWiC)},
month = {7},
title = {Thematic Analysis for Sonic Interaction Design},
year = {2018},
}
@inproceedings{BUYS2018,
abstract = {Although the physics of the bowed violin string are well understood, most audio feature extraction algorithms for violin still rely on general-purpose signal processing methods with latencies and accuracy rates that are unsuitable for real-time professional-calibre performance. Starting from a pickup which cleanly captures the motion of the bowed string with minimal colouration from the bridge and body, we present a lightweight time-domain method for modelling string motion using segmented linear regression. The algorithm leverages knowledge of the patterns of Helmholtz motion to produce a set of features which can be used for control of real-time synthesis processes. The goal of the paper is not a back-extraction of physical ground truth, but a responsive, low-latency feature space suitable for performance applications.},
author = {K BUYS and A MCPHERSON},
doi = {10.5281/zenodo.1422597},
journal = {https://zenodo.org/record/1422597},
month = {7},
title = {Real-time bowed string feature extraction for performance applications},
year = {2018},
}
@inproceedings{CHETTRI2018b,
abstract = {Playing recorded speech samples of an enrolled speaker - "replay attack" - is a simple approach to bypass an automatic speaker verification (ASV) system. The vulnerability of ASV systems to such attacks has been acknowledged and studied, but there has been no research into what spoofing detection systems are actually learning to discriminate. In this paper, we analyse the local behaviour of a replay spoofing detection system based on convolutional neural networks (CNNs) adapted from a state-of-the-art CNN (LCNN-FFT) submitted at the ASVspoof 2017 challenge. We generate temporal and spectral explanations for predictions of the model using the SLIME algorithm. Our findings suggest that in most instances of spoofing the model is using information in the first 400 milliseconds of each audio instance to make the class prediction. Knowledge of the characteristics that spoofing detection systems are exploiting can help build less vulnerable ASV systems, other spoofing detection systems, as well as better evaluation databases.},
author = {B CHETTRI and S MISHRA and B STURM and E BENETOS},
journal = {http://www.slt2018.org/},
month = {12},
pages = {92-97},
publisher = {IEEE},
title = {Analysing the predictions of a CNN-based replay spoofing detection system},
year = {2018},
}
@inproceedings{Choi2018,
abstract = {In this paper, we empirically investigate the effect of audio preprocessing on music tagging with deep neural networks. While it is important to choose the best preprocessing strategy from an engineering perspective, it usually has been out of the focus in many academic research. We perform comprehensive experiments involving audio preprocessing using different time-frequency representations, logarithmic magnitude compression, frequency weighting, and scaling. We show that many commonly used input audio preprocessing techniques are redundant except logarithmic magnitude compression.},
author = {K Choi and G Fazekas and M Sandler and K Cho},
journal = {Proc. of the 26th European Signal Processing Conference (EUSIPCO 2018), 3-7 Sept, Rome, Italy},
note = {keywords: Signal Processing, Deep Learning, MIR, Auto-tagging date-added: 2018-05-06 23:32:25 +0000 date-modified: 2018-05-29 23:32:25 +0000},
title = {A Comparison of Audio Signal Preprocessing Methods for Deep Neural Networks on Music Tagging},
year = {2018},
}
@article{Choi2018b,
abstract = {Deep neural networks (DNN) have been successfully applied to music classification including music tagging. However, there are several open questions regarding the training, evaluation, and analysis of DNNs. In this article, we investigate specific aspects of neural networks, the effects of noisy labels, to deepen our understanding of their properties. We analyse and (re-)validate a large music tagging dataset to investigate the reliability of training and evaluation. Using a trained network, we compute label vector similarities which is compared to groundtruth similarity. The results highlight several important aspects of music tagging and neural networks. We show that networks can be effective despite relatively large error rates in groundtruth datasets, while conjecturing that label noise can be the cause of varying tag-wise performance differences. Lastly, the analysis of our trained network provides valuable insight into the relationships between music tags. These results highlight the benefit of using data-driven methods to address automatic music tagging.},
author = {K Choi and G Fazekas and M Sandler and K Cho},
doi = {10.1109/TETCI.2017.2771298},
issue = {2},
journal = {IEEE Transactions on Emerging Topics in Computational Intelligence},
month = {3},
note = {date-added: 2018-06-06 23:32:25 +0000 date-modified: 2018-05-06 23:32:25 +0000 keywords: evaluation, music tagging, deep learning, CNN bdsk-url-1: https://arxiv.org/pdf/1706.02361.pdf bdsk-url-2: https://dx.doi.org/10.1109/TETCI.2017.2771298},
pages = {139-149},
publisher = {IEEE},
title = {The Effects of Noisy Labels on Deep Convolutional Neural Networks for Music Tagging},
volume = {2},
url = {http://semanticaudio.net/},
year = {2018},
}
@inproceedings{CHOURDAKIS2018,
abstract = {A radio play is a form of drama which exists in the acoustic domain and is usually consumed over broadcast radio. In this paper a method is proposed that, given a story in the form of unstructured text, produces a radio play that tells this story. First, information about characters, acting lines, and environments is retrieved from the text. The information extracted serves to generate a production script which can be used either by producers of radio drama, or subsequently used to automatically generate the radio play as an audio file. The system is evaluated in two parts: precision, recall, and F1 scores are computed for the information retrieval part while multistimulus listening tests are used for subjective evaluation of the generated audio.},
author = {E T CHOURDAKIS and JOSHUA D REISS},
month = {7},
title = {From my pen to your ears: automatic production of radio plays from unstructured story text},
url = {https://scholar.google.co.uk/citations?hl=en&user=Hf0rcRcAAAAJ},
year = {2018},
}
@article{Dixon2018,
author = {S Dixon and E Gómez and A Volk},
doi = {10.5334/tismir.22},
issn = {2514-3298},
issue = {1},
journal = {Transactions of the International Society for Music Information Retrieval},
month = {1},
pages = {1-3},
publisher = {Ubiquity Press},
title = {Editorial: Introducing the Transactions of the International Society for Music Information Retrieval},
volume = {1},
year = {2018},
}
@inproceedings{Droog2018,
abstract = {© Proceedings of AISB Annual Convention 2018. All rights reserved. Automatic summarization is dominated by approaches which focus on the selection and concatenation of material in a text. What can be achieved by such approaches is intrinsically limited and far below what can be achieved by human summarizers. There is evidence that successfully creating a rich representation of text, including details of its narrative structure, would help to create more human-like summaries. This paper describes a part of our ongoing work on a cognitively inspired, creative approach to summarization. Here we detail our work on the detection of narrative structure in order to help build rich interpretations of a text and help give rise to a creative approach to summarization. In particular we consider the domain of Russian folktales. Using Vladimir Propp’s thorough description of the interrelations between the narrative elements of such tales, we pose this task as a constraint satisfaction problem. While we only consider this small domain, our approach can be applied to any domain of text on which enough constraints can be placed.},
author = {M Droog-Hayes and G Wiggins and M Purver},
journal = {Proceedings of AISB Annual Convention 2018},
month = {1},
title = {Automatic detection of narrative structure for high-level story representation},
year = {2018},
}
@article{Duffy2018,
abstract = {© 2018, Hacettepe University. All rights reserved. Whilst the focus of attention in an instrumental music lesson is refinement of the student’s musical performance, conversation plays an essential role; not just as a way to analyse the student’s musical contributions, but to organise them within the lesson flow. Participants may respond to talk through performance and vice versa, or even spend periods of time exchanging purely musical contributions. The short musical fragments exchanged by the participants are managed within lesson dialogue in ways analogous to conversational turn-taking. Problems in the student’s performance are refined through both student self-initiated and tutor other-initiated repair, initiated by embodied action and play. A fundamental part of turn-taking is managing the transition to a new speaker. The presence of musical contributions allows for additional types of transition, for example from a turn at talk, to a musical contribution. In conversation, there is generally a preference for a short pause at the transition to a new speaker, and overlap tends to be minimised when it occurs. Through detailed qualitative video analysis of a one-to-one clarinet lesson, we find differences in the preferences regarding overlap when purely musical contributions are being exchanged, and that the duration of overlap during these exchanges of fragments of music are significant.},
author = {S Duffy and P G T Healey},
doi = {10.16986/HUJE.2018038809},
issn = {1300-5340},
issue = {Special Issue},
journal = {Hacettepe Egitim Dergisi},
month = {1},
pages = {316-333},
title = {Refining musical performance through overlap},
volume = {33},
year = {2018},
}
@article{Duffy2018b,
abstract = {Clapping Music is a minimalist work by Steve Reich based on twelve phased variations of a rhythmic pattern. It has been reimagined as a game-based mobile application, designed with a dual purpose. First, to introduce new audiences to the Minimalist genre through interaction with the piece presented as an engaging game. Second, to use large-scale data collection within the app to address research questions about the factors determining rhythm production performance. The twelve patterns can be differentiated using existing theories of rhythmic complexity. Using performance indicators from the game such as tap accuracy we can determine which patterns players found most challenging and so assess hypotheses from theoretical models with empirical evidence. The app has been downloaded over 140,000 times since the launch in July 2015, and over 46 million rows of gameplay data have been collected, requiring a big data approach to analysis. The results shed light on the rhythmic factors contributing to performance difficulty and show that the effect of making a transition from one pattern to the next is as significant, in terms of pattern difficulty, as the inherent complexity of the pattern itself. Challenges that arose in applying this novel approach are discussed.},
author = {S Duffy and M Pearce},
doi = {10.1371/journal.pone.0205847},
issue = {10},
journal = {PLoS One},
month = {10},
pages = {e0205847–e0205847},
title = {What makes rhythms hard to perform? An investigation using Steve Reich's Clapping Music.},
volume = {13},
url = {https://www.ncbi.nlm.nih.gov/pubmed/30335798},
year = {2018},
}
@inproceedings{Fano2018,
abstract = {Kernel Additive Modelling (KAM) is a framework for source separation aiming to explicitly model inherent properties of sound sources to help with their identification and separation. KAM separates a given source by applying robust statistics on the selection of time-frequency bins obtained through a source-specific kernel, typically the k-NN function. Even though the parameter k appears to be key for a successful separation, little discussion on its influence or optimisation can be found in the literature. Here we propose a novel method, based on graph theory statistics, to automatically optimise k in a vocal separation task. We introduce the k-NN hubness as an indicator to find a tailored k at a low computational cost. Subsequently, we evaluate our method in comparison to the common approach to choose k. We further discuss the influence and importance of this parameter with illuminating results.},
author = {D Fano Yela and D Stowell and M Sandler},
doi = {10.1007/978-3-319-93764-9_27},
isbn = {9783319937632},
issn = {0302-9743},
journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
month = {6},
pages = {280-289},
title = {Does K matter? k-NN hubness analysis for kernel additive modelling vocal separation},
volume = {10891 LNCS},
year = {2018},
}
@inproceedings{Flynn2018,
abstract = {© 2018 Audio Engineering Society. All Rights Reserved. Current closed-form IIR methods for approximating an analogue prototype filter in the discrete-domain do not match frequency response phase. The frequency sampling method can match phase, but requires extremely long filter lengths (and corresponding latency) to perform well at low frequencies. We propose a method for discretising an analogue prototype that does not succumb to these issues. Contrary to the IIR methods, it accurately approximates the phase, as well as the magnitude response. The proposed method exhibits good low frequency resolution using much smaller filter lengths than design by frequency sampling.},
author = {J Flynn and J D Reiss},
journal = {144th Audio Engineering Society Convention 2018},
month = {1},
title = {Improving the frequency response magnitude and phase of analogue-matched digital filters},
year = {2018},
}
@inproceedings{Freeman2018,
abstract = {© 2018 ISAST. How can we describe data when used as an art material? As the number of artists using data in their work increases, so too must our ability to describe the material in a way that is understood by both specialist and general audiences alike. Based on a review of existing vocabularies, glossaries and taxonomies of data, we propose our own concise taxonomy. To conclude, we propose the adoption of this concise taxonomy by artists, critics and curators, and suggest that ongoing refinement of the taxonomy takes place through crowdsourced knowledge sharing on the Web.},
author = {J Freeman and G Wiggins and G Starks and M Sandler},
doi = {10.1162/LEON_a_01414},
issn = {0024-094X},
issue = {1},
journal = {Leonardo},
month = {2},
pages = {75-79},
title = {A concise taxonomy for describing data as an art material},
volume = {51},
year = {2018},
}
@inproceedings{Frieler2018,
abstract = {© Klaus Frieler, Frank Höger, Martin Pfleiderer, Simon Dixon. This paper presents two novel user interfaces for investigating the pattern content in monophonic jazz solos and exemplifies how these interfaces could be used for research on jazz improvisation. In jazz improvisation, patterns are of particular interest for the analysis of improvisation styles, the oral transmission of musical language, the practice of improvisation, and the psychology of creative processes. The ongoing project “Dig That Lick” is devoted to addressing these questions with the help of a large database of jazz solo transcriptions generated by automated melody extraction algorithms. To expose these transcriptions to jazz researchers, two prototypes of user interfaces were designed that work currently with the 456 manually transcribed jazz solos of the Weimar Jazz Database. The first one is a Shiny application that allows exploring a set of 653 of the most common patterns by eminent players. The second one is a web interface for a general two-staged pattern search in the Weimar Jazz Database featuring regular expressions. These applications aim on the one hand at an expert audience of jazz researchers to facilitate generating and testing hypotheses about patterns in jazz improvisation, and on the other hand at a wider audience of jazz teachers, students, and fans.},
author = {K Frieler and F Höger and M Pfleiderer and S Dixon},
isbn = {9782954035123},
journal = {Proceedings of the 19th International Society for Music Information Retrieval Conference, ISMIR 2018},
month = {1},
pages = {777-783},
title = {Two web applications for exploring melodic patterns in jazz solos},
year = {2018},
}
@inproceedings{Galindo2018,
abstract = {© 2018 Association for Computing Machinery. This paper describes an interactive scenography designed to enhance the use of embodied imagination in a stroke survivors' performance workshop called The Green Screening workshop, conceived by the company Split Britches.We explore performance techniques combined with live motion capture to provide participants with an abstract visual world that helps them to enact fantasy scenes they create in front of an audience. A simple interface provides real-time visualisations of participant's body movements in three different scenarios and promotes engagement with the co-present audience. The system was evaluated in two workshops with stroke survivors. The results indicate that the system is effective in encouraging participants' creative use of embodied improvisation.},
author = {R P Galindo Esparza and P G T Healey and L Weaver and M Delbridge},
doi = {10.1145/3212721.3212845},
isbn = {9781450365048},
journal = {ACM International Conference Proceeding Series},
month = {6},
title = {Augmented embodiment: Developing interactive technology for stroke survivors short paper},
year = {2018},
}
@article{GODDARD2018,
abstract = {Computationally creative systems require semantic information when reflecting or self reasoning on their output. In this paper, we outline the design of a computationally creative musical performance system aimed at producing virtuosic interpretations of musical pieces and provide an overview of its implementation. The case-based reasoning part of the system relies on a measure of musical similarity based on the FANTASTIC and SynPy toolkits which provide melodic and syncopated rhythmic features, respectively. We conducted a listening test based on pair-wise comparison to assess to what extent the machine-based similarity models match human perception. We found the machine-based models to differ significantly to human responses due to differences in participants' responses. The best performing model relied on features from the FANTASTIC toolkit obtaining a rank match rate with human response of 63%, whilst features from the SynPy toolkit only obtained a ranking match rate of 46%. Whilst more work is needed on a stronger model of similarity, we do not believe these results prevent FANTASTIC features being used as a measure of similarity within creative systems.},
author = {C GODDARD and M BARTHET and G WIGGINS},
doi = {10.17743/jaes.2018.0012},
issn = {1549-4950},
journal = {Journal of the Audio Engineering Society},
month = {4},
publisher = {Audio Engineering Society},
title = {Assessing Musical Similarity for Computational Music Creativity},
year = {2018},
}
@inproceedings{Goodman2018,
abstract = {© 2018 IEEE. As wearable technologies and etextiles mature they are being increasingly used in couture and high street fashion. However, much of the innovation in this space has been driven by technological and commercial imperatives. As the wearables and etextile markets mature it is time to consider this technological landscape in the bigger picture of a sustainable human-centred world. This paper reports on initial findings from 48 projects supported through the EU funded WEAR Sustain network to examine sustainable and ethical approaches to wearable technology design. Case studies of collaborations between artists and technologists in designing and realising sustainable and ethical wearable technologies are presented, and an initial set of themes emerging from detailed analysis of all the project updates are outlined highlighting the importance of cross-disciplinary hubs, mentors, and networks.},
author = {L Goodman and N Bryan-Kinns and Y Wu and S Liu and C Baker},
doi = {10.1109/GEM.2018.8516276},
isbn = {9781538663042},
journal = {2018 IEEE Games, Entertainment, Media Conference, GEM 2018},
month = {10},
pages = {1-3},
title = {WEAR Sustain Network: Ethical and Sustainable Technology Innovation in Wearables and Etextiles},
year = {2018},
}
@article{Healey2018,
abstract = {Miscommunication is a neglected issue in the cognitive sciences, where it has often been discounted as noise in the system. This special issue argues for the opposite view: Miscommunication is a highly structured and ubiquitous feature of human interaction that systematically underpins people's ability to create and maintain shared languages. Contributions from conversation analysis, computational linguistics, experimental psychology, and formal semantics provide evidence for these claims. They highlight the multi-modal, multi-person character of miscommunication. They demonstrate the incremental, contingent, and locally adaptive nature of the processes people use to detect and deal with miscommunication. They show how these processes can drive language change. In doing so, these contributions introduce an alternative perspective on what successful communication is, new methods for studying it, and application areas where these ideas have a particular impact. We conclude that miscommunication is not noise but essential to the productive flexibility of human communication, especially our ability to respond constructively to new people and new situations.},
author = {P G T Healey and J P de Ruiter and G J Mills},
doi = {10.1111/tops.12340},
issue = {2},
journal = {Topics in Cognitive Science},
month = {5},
pages = {264-278},
title = {Editors' Introduction: Miscommunication.},
volume = {10},
url = {https://www.ncbi.nlm.nih.gov/pubmed/29749040},
year = {2018},
}
@generic{HEALEY2018b,
author = {P G T HEALEY and M R J PURVER},
month = {11},
title = {Self-Repetition in Dialogue and Monologue},
url = {http://www.eecs.qmul.ac.uk/~mpurver/papers/healey-purver18semdial.pdf},
year = {2018},
}
@article{Healey2018c,
abstract = {People give feedback in conversation: both positive signals of understanding, such as nods, and negative signals of misunderstanding, such as frowns. How do signals of understanding and misunderstanding affect the coordination of language use in conversation? Using a chat tool and a maze-based reference task, we test two experimental manipulations that selectively interfere with feedback in live conversation: (a) "Attenuation" that replaces positive signals of understanding such as "right" or "okay" with weaker, more provisional signals such as "errr" or "umm" and (2) "Amplification" that replaces relatively specific signals of misunderstanding from clarification requests such as "on the left?" with generic signals of trouble such as "huh?" or "eh?". The results show that Amplification promotes rapid convergence on more systematic, abstract ways of describing maze locations while Attenuation has no significant effect. We interpret this as evidence that "running repairs"-the processes of dealing with misunderstandings on the fly-are key drivers of semantic coordination in dialogue. This suggests a new direction for experimental work on conversation and a productive way to connect the empirical accounts of Conversation Analysis with the representational and processing concerns of Formal Semantics and Psycholinguistics.},
author = {P G T Healey and G J Mills and A Eshghi and C Howes},
doi = {10.1111/tops.12336},
issue = {2},
journal = {Topics in Cognitive Science},
month = {4},
pages = {367-388},
title = {Running Repairs: Coordinating Meaning in Dialogue.},
volume = {10},
url = {https://www.ncbi.nlm.nih.gov/pubmed/29687611},
year = {2018},
}
@inproceedings{Heitlinger2018,
abstract = {© 2018 ACM. We present a case study of a participatory design project in the space of sustainable smart cities and Internet of Things. We describe our design process that led to the development of an interactive seed library that tells the stories of culturally diverse urban food growers, and networked environmental sensors from their gardens, as a way to support more sustainable food practices in the city. This paper contributes to an emerging body of empirical work within participatory design that seeks to involve citizens in the design of smart cities and Internet of Things, particularly in the context of marginalised and culturally diverse urban communities. It also contributes empirical work towards non-utilitarian approaches to sustainable smart cities through a discussion of designing for urban diversity and slowness.},
author = {S Heitlinger and N Bryan-Kinns and R Comber},
doi = {10.1145/3210604.3210620},
isbn = {9781450364645},
journal = {ACM International Conference Proceeding Series},
month = {9},
title = {Connected seeds and sensors: Co-designing internet of things for sustainable smart cities with urban food-growing communities},
volume = {2},
year = {2018},
}
@article{Hu2018,
abstract = {© 2018, Springer Nature B.V. Design thinking holds the key to innovation processes, but is often difficult to detect because of its implicit nature. We undertook a study of novice designers engaged in team-based design exercises in order to explore the correlation between design thinking and designers’ physical (observable) behavior and to identify new, objective, design thinking identification methods. Our study addresses the topic by using data collection method of “think aloud” and data analysis method of “protocol analysis” along with the unconstrained concept generation environment. Collected data from the participants without service design experience were analyzed by open and selective coding. Through the research, we found correlations between physical activity and divergent thinking, and also identified physical behaviors that predict a designer’s transition to divergent thinking. We conclude that there are significant relations between designers’ design thinking and the behavioral features of their body and face. This approach opens possible new ways to undertake design process research and also design capability evaluation.},
author = {Y Hu and X Du and N Bryan-Kinns and Y Guo},
doi = {10.1007/s10798-018-9479-7},
issn = {0957-7572},
journal = {International Journal of Technology and Design Education},
month = {10},
title = {Identifying divergent design thinking through the observable behavior of service design novices},
year = {2018},
}
@article{JACK2018,
abstract = {Asynchrony between tactile and auditory feedback (action-sound latency) when playing a musical instrument is widely recognised as disruptive to musical performance. In this paper we present a study that assesses the effects of delayed auditory feedback on the timing accuracy and judgements of instrument quality for two groups of participants: professional percussionists and non-percussionist amateur musicians. The amounts of delay tested in this study are relatively small in comparison to similar studies of auditory delays in a musical context (0ms, 10ms, 10ms±3ms, 20ms). We found that both groups rated the zero latency condition as higher quality for a series of quality measures in comparison to 10ms±3ms and 20ms latency, but did not show a significant difference in rating between 10ms latency and zero latency. Professional percussionists were more aware of the latency conditions and showed less variation of timing under the latency conditions, although this ability decreased as the temporal demands of the task increased. We compare our findings from each group and discuss them in relation to latency in interactive digital systems more generally and experimentally similar work on sensorimotor control and rhythmic performance.},
author = {R H JACK and A MEHRABI and T Stockman and A MCPHERSON},
doi = {10.1525/mp.2018.36.1.109},
editor = {K Stevens and K Hutchings},
issn = {0730-7829},
journal = {Music Perception},
month = {8},
publisher = {University of California Press},
title = {Action-sound Latency and the Perceived Quality of Digital Musical Instruments: Comparing Professional Percussionists and Amateur Musicians},
year = {2018},
}
@inproceedings{Jillings2018,
abstract = {© 2018 KASHYAP. Subjective experiments are a cornerstone of modern research, with a variety of tasks being undertaken by subjects. In the field of audio, subjective listening tests provide validation for research and aid fair comparison between techniques or devices such as coding performance, speakers, mixes and source separation systems. Several interfaces have been designed to mitigate biases and to standardise procedures, enabling indirect comparisons. The number of different combinations of interface and test design make it extremely difficult to conduct a truly unbiased listening test. This paper resolves the largest of these variables by identifying the impact the interface itself has on a purely auditory test. This information is used to make recommendations for specific categories of listening tests.},
author = {N Jillings and B De Man and R Stables and J D Reiss},
journal = {145th Audio Engineering Society International Convention, AES 2018},
month = {1},
title = {Investigation into the effects of subjective test interface choice on the validity of results},
year = {2018},
}
@book_section{KUDUMAKIS2018b,
author = {P KUDUMAKIS and J Corral García and I Barbancho and L J. Tardón and M SANDLER},
doi = {10.1007/978-3-662-55004-5_45},
editor = {R Bader},
issue = {45},
journal = {Springer Handbook of Systematic Musicology},
month = {1},
pages = {911-921},
publisher = {Springer, Berlin, Heidelberg},
title = {Enabling Interactive and Interoperable Semantic Music Applications},
year = {2018},
}
@book_section{Lavia2018,
abstract = {© 2018 by IGI Global. All rights reserved. More accurate non-participatory parameters and psychoacoustics to assess human perceptual responses to the acoustic environment are critical to inform effective urban sound planning and applied soundscape practice. Non-participatory observation methods are widely used by experts to capture animal behavior. In 2012, Lavia and Witchel applied these principles and methodologies for the first time to capturing and assessing human behavior "in the wild" to changes to the acoustic environment using added sound and music interventions in a clubbing district. Subsequent work was conducted with Aletta and Kang and Healey, Howes, Steffens, and Fiebig to begin characterizing the acoustic environment and human responses to align the perceptual and physical findings. Here, the authors report on new work and analysis and propose a preliminary predictive agile applied soundscape framework using non-participatory observation methods and psychoacoustics to be used with environmental assessment practice and evolving urban soundscape planning methods by researchers, practitioners, and policy makers.},
author = {L Lavia and H J Witchel and F Aletta and J Steffens and A Fiebig and J Kang and C Howes and P G T Healey},
doi = {10.4018/978-1-5225-3637-6.ch004},
isbn = {152253637X},
journal = {Handbook of Research on Perception-Driven Approaches to Urban Assessment and Design},
month = {1},
pages = {73-98},
title = {Non-participant observation methods for soundscape design and urban planning},
year = {2018},
}
@inproceedings{Li2018,
abstract = {© 2018 Technical Committee on Control Theory, Chinese Association of Automation. Analysing expressive timing in performed music can help machine to perform various perceptual tasks such as identifying performers and understand music structures in classical music. A hierarchical structure is commonly used for expressive timing analysis. This paper provides a statistical demonstration to support the use of hierarchical structure in expressive timing analysis by presenting two groups of model selection tests. The first model selection test uses expressive timing to determine the location of music structure boundaries. The second model selection test is matching a piece of performance with the same performer playing another given piece. Comparing the results of model selection tests, the preferred hierarchical structures in these two model selection tests are not the same. While determining music structure boundaries demands a hierarchical structure with more levels in the expressive timing analysis, a hierarchical structure with less levels helps identifying the dedicated performer in most cases.},
author = {S Li and S Dixon and M D Plumbley},
doi = {10.23919/ChiCC.2018.8483169},
isbn = {9789881563941},
issn = {1934-1768},
journal = {Chinese Control Conference, CCC},
month = {10},
pages = {3190-3195},
title = {A Demonstration of Hierarchical Structure Usage in Expressive Timing Analysis by Model Selection Tests},
volume = {2018-July},
year = {2018},
}
@article{Liang2018,
abstract = {This paper presents a study of piano pedalling gestures and techniques on the sustain pedal from the perspective of measurement, recognition and visualisation. Pedalling gestures can be captured by a dedicated measurement system where the sensor data can be simultaneously recorded alongside the piano sound under normal playing conditions. Using the sensor data collected from the system, the recognition is comprised of two separate tasks: pedal onset/offset detection and classification by technique. The onset and offset times of each pedalling technique were computed using signal processing algorithms. Based on features extracted from every segment when the pedal is pressed, the task of classifying the segments by pedalling technique was undertaken using machine learning methods. We compared Support Vector Machines (SVM) and hidden Markov models (HMM) for this task. Our system achieves high accuracies, over 0.7 F1 score for all techniques and over 0.9 on average. The recognition results can be represented using novel pedalling notations and visualised in an audio-based score following application.},
author = {B Liang and G Fazekas and M Sandler},
doi = {10.17743/jaes.2018.0035},
issue = {47},
journal = {JAES Special Issue on Participatory Sound And Music Interaction Using Semantic Audio},
month = {6},
note = {date-added: 2018-06-06 23:32:25 +0000 date-modified: 2018-05-06 23:32:25 +0000 keywords: sensor system, piano pedalling, measurement, machine learning, gesture recognition, piano transcription},
pages = {xxxx–xxxx},
title = {Measurement, Recognition and Visualisation of Piano Pedalling Gestures and Techniques},
volume = {2},
year = {2018},
}
@inproceedings{Liang2018b,
abstract = {In this paper, the problem of legato pedalling technique detection in polyphonic piano music is addressed. We propose a novel detection method exploiting the effect of sympathetic resonance which can be enhanced by a legato-pedal onset. To measure the effect, specific piano transcription was performed using the templates of pre-recorded isolated notes, from which partial frequencies were estimated. This promotes the acquisition of residual components associated to the weak co-excitation of damped notes due to the legato pedalling technique. Features that represent the sympathetic resonance measure were extracted from residuals. We finally used a logistic regression classifier to distinguish the existence of legato-pedal onsets.},
author = {B Liang and G Fazekas and M Sandler},
doi = {10.23919/EUSIPCO.2018.8553341},
journal = {Proceedings of the 26th European Signal Processing Conference (EUSIPCO 2018)},
month = {9},
pages = {2484-2488},
publisher = {IEEE},
title = {Piano Legato-Pedal Onset Detection Based on a Sympathetic Resonance Measure},
year = {2018},
}
@inproceedings{Marengo2018,
abstract = {The digitization of art collections is a great opportunity to engage audiences beyond the context of the museum visit. Interfaces to access collections have been initially tailored for professional search tasks: the new challenge is how to design systems for open, casual, and leisure-based explorations. In a human-centered framework, the users’ perspective is a fundamental step to design and improve creative solutions. How can we listen to and understand the potential users, in order to design meaningful experiences? How can we collect insights, and what do these tell us about the users and the systems? We explore the use of inquiry techniques as a method to surface the curiosities people have for paintings. During two iterations, visitors of public events wrote questions they had about selected paintings. 138 Post-its were collected and thematically analyzed. Results highlight that curiosities are contextualized, and that artworks are interpreted mainly as scenes. People are interested in meanings and symbols; they also displayed the use of fantasy and empathy. Additionally, we evaluated the effect of age, previous knowledge of the painting, and frequency of visiting museums on the questions’ content through statistical analysis. While no strong finding emerged, we noticed that adults and kids likewise display an active role in the inquiry process, and that a previous knowledge of the painting is connected to more descriptive and atomic curiosities. In the discussion, we suggest design opportunities might lay in the interactive discovery of information, in storytelling-based descriptions, and in emotional connection. Our findings suggest that in leisure-based explorations atomic information might not be satisfying, and that descriptions should be contextualized to the painting. Our presentation will be an opportunity to discuss the value of the method, and to comment on how the insights could be embedded into the design of leisure-based experiences.},
author = {L Marengo and G Fazekas and A Tombros},
journal = {Proc. International Conference on Museums and the Web 2018, April 18-21, Vancouver, Canada.},
note = {date-added: 2018-05-01 00:11:04 +0000 date-modified: 2018-05-01 00:16:25 +0000 keywords: visual art, information design, inquiry techniques, user requirements, online collections, interaction design bdsk-url-1: http://mw18.mwconf.org/paper/i-wonder-inquiry-techniques-as-a-method-to-gain-insights-into-peoples-encounters-with-visual-art},
title = {I Wonder... Inquiry Techniques As A Method To Gain Insights Into People’s Encounters With Visual Art},
url = {http://mw18.mwconf.org/paper/i-wonder-inquiry-techniques-as-a-method-to-gain-insights-into-peoples-encounters-with-visual-art},
year = {2018},
}
@inproceedings{martinez2018,
abstract = {This work aims to implement a novel deep learning architecture to perform audio processing in the context of matched equalization. Most existing methods for automatic and matched equalization show effective performance and their goal is to find a respective transfer function given a frequency response. Nevertheless, these procedures require a prior knowledge of the type of filters to be modeled. In addition, fixed filter bank architectures are required in automatic mixing contexts. Based on end-to-end convolutional neural networks, we introduce a general purpose architecture for equalization matching. Thus, by using an end-to-end learning approach, the model approximates the equalization target as a content-based transformation without directly finding the transfer function. The network learns how to process the audio directly in order to match the equalized target audio. We train the network through unsupervised and supervised learning procedures. We analyze what the model is actually learning and how the given task is accomplished. We show the model performing matched equalization for shelving, peaking, lowpass and highpass IIR and FIR equalizers.},
author = {M Martinez Ramirez and J Reiss},
month = {9},
title = {End-to-end equalization with convolutional neural networks},
url = {http://www.m-marco.com/},
year = {2018},
}
@inproceedings{McArthur2018,
abstract = {© Audio Engineering Society. All rights reserved. This study examines auditory distance discrimination in cinematic virtual reality. It uses controlled stimuli with audio-visual distance variations, to determine if mismatch stimuli are detected. It asks if visual conditions - either equally or unequally distanced from the user, and environmental conditions - either a reverberant space as opposed to a freer field, impact accuracy in discrimination between congruent and incongruent aural and visual cues. A Repertory Grid Technique-derived design is used, whereby participant-specific constructs are translated into numerical ratings. Discrimination of auditory event mismatch was improved for stimuli with varied visual-event distances, though not for equidistant visual events. This may demonstrate that visual cues alert users to matches and mismatches.},
author = {A McArthur and M Sandler and R Stewart},
isbn = {9781510870390},
journal = {Proceedings of the AES International Conference},
month = {1},
pages = {24-33},
title = {Perception of mismatched auditory distance - Cinematic VR},
volume = {2018-August},
year = {2018},
}
@article{McCabe2018,
abstract = {Copyright © 2018 Cognitive Science Society, Inc. The effectiveness of medical treatment depends on the quality of the patient–clinician relationship. It has been proposed that this depends on the extent to which the patient and clinician build a shared understanding of illness and treatment. Here, we use the tools of conversation analysis (CA) to explore this idea in the context of psychiatric consultations. The CA “repair” framework provides an analysis of the processes people use to deal with problems in speaking, hearing, and understanding. These problems are especially critical in the treatment of psychosis where patients and health care professionals need to communicate about the disputed meaning of hallucinations and delusion. Patients do not feel understood, they are frequently non-adherent with treatment, and many have poor outcomes. We present an overview of two studies focusing on the role of repair as a mechanism for producing and clarifying meaning in psychiatrist–patient communication and its association with treatment outcomes. The first study shows patient clarification or repair of psychiatrists’ talk is associated with better patient adherence to treatment. The second study shows that training which emphasizes the importance of building an understanding of patients’ psychotic experiences increases psychiatrists’ self-repair. We propose that psychiatrists are working harder to make their talk understandable and acceptable to the patient by taking the patient's perspective into account. We conclude that these findings provide evidence that repair is an important mechanism for building shared understanding in doctor–patient communication and contributes to better therapeutic relationships and treatment adherence. The conversation analytic account of repair is currently the most sophisticated empirical model for analyzing how people construct shared meaning and understanding. Repair appears to reflect greater commitment to and engagement in communication and improve both the quality and outcomes of communication. Reducing potential miscommunication between psychiatrists and their patients with psychosis is a low-cost means of enhancing treatment from both the psychiatrist and patient perspective. Given that misunderstanding and miscommunication are particularly problematic in psychosis, this is critical for improving the longer term outcomes of treatment for these patients who often have poor relationships with psychiatrists and health care services more widely.},
author = {R McCabe and P G T Healey},
doi = {10.1111/tops.12337},
issn = {1756-8757},
issue = {2},
journal = {Topics in Cognitive Science},
month = {4},
pages = {409-424},
title = {Miscommunication in Doctor–Patient Communication},
volume = {10},
year = {2018},
}
@inproceedings{Mehrabi2018,
abstract = {© 2018 IEEE. The expressive nature of the voice provides a powerful medium for communicating sonic ideas, motivating recent research on methods for query by vocalisation. Meanwhile, deep learning methods have demonstrated state-of-the-art results for matching vocal imitations to imitated sounds, yet little is known about how well learned features represent the perceptual similarity between vocalisations and queried sounds. In this paper, we address this question using similarity ratings between vocal imitations and imitated drum sounds. We use a linear mixed effect regression model to show how features learned by convolutional auto-encoders (CAEs) perform as predictors for perceptual similarity between sounds. Our experiments show that CAEs outperform three baseline feature sets (spectrogram-based representations, MFCCs, and temporal features) at predicting the subjective similarity ratings. We also investigate how the size and shape of the encoded layer affects the predictive power of the learned features. The results show that preservation of temporal information is more important than spectral resolution for this application.},
author = {A Mehrabi and K Choi and S Dixon and M Sandler},
doi = {10.1109/ICASSP.2018.8461566},
isbn = {9781538646588},
issn = {1520-6149},
journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
month = {9},
pages = {356-360},
title = {Similarity Measures for Vocal-Based Drum Sample Retrieval Using Deep Convolutional Auto-Encoders},
volume = {2018-April},
year = {2018},
}
@inproceedings{Men2018,
author = {L Men and N Bryan-Kinns},
doi = {10.1109/SIVE.2018.8577094},
isbn = {9781538657133},
journal = {2018 IEEE 4th VR Workshop on Sonic Interactions for Virtual Environments, SIVE 2018},
month = {12},
title = {LeMo: Supporting Collaborative Music Making in Virtual Reality},
year = {2018},
}
@article{Mesaros2018,
abstract = {Public evaluation campaigns and datasets promote active development in target research areas, allowing direct comparison of algorithms. The second edition of the challenge on detection and classification of acoustic scenes and events (DCASE 2016) has offered such an opportunity for development of the state-of-the-art methods, and succeeded in drawing together a large number of participants from academic and industrial backgrounds. In this paper, we report on the tasks and outcomes of the DCASE 2016 challenge. The challenge comprised four tasks: acoustic scene classification, sound event detection in synthetic audio, sound event detection in real-life audio, and domestic audio tagging. We present each task in detail and analyze the submitted systems in terms of design and performance. We observe the emergence of deep learning as the most popular classification method, replacing the traditional approaches based on Gaussian mixture models and support vector machines. By contrast, feature representations have not changed substantially throughout the years, as mel frequency-based representations predominate in all tasks. The datasets created for and used in DCASE 2016 are publicly available and are a valuable resource for further research.},
author = {A Mesaros and T Heittola and E Benetos and P Foster and M Lagrange and T Virtanen and M Plumbley},
doi = {10.1109/TASLP.2017.2778423},
issn = {2329-9304},
issue = {2},
journal = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
month = {2},
pages = {379-393},
publisher = {Institute of Electrical and Electronics Engineers},
title = {Detection and Classification of Acoustic Scenes and Events: Outcome of the DCASE 2016 Challenge},
volume = {26},
url = {http://ieeexplore.ieee.org/document/8123864/},
year = {2018},
}
@book_section{Milo2018,
abstract = {© 2018 by IGI Global. All rights reserved. This chapter presents an overview of 3 graphical tools supporting soundscape assessment in different settings, indoors and outdoors. These research prototypes support the spatial organization of the perceptual information available to the participants and are designed based on surveying techniques used in architectural training to create a foundation for acoustic design education in architecture schools. This chapter reports the contexts of the focus group investigations, presenting advantages and drawbacks related to their use. It has been found that participants often added explanatory verbal data and arrows to the provided diagrams. The diagrams and their use have been interpreted with the support of the qualitative data captured during the studies through thematic analysis. Finally, paper prototypes are useful for educational approaches, but future, more comprehensive studies will require integrating these tools into existing or yet-to-be-designed systematic frameworks for soundscape analysis and design.},
author = {A Milo and N Bryan-Kinns and J D Reiss},
doi = {10.4018/978-1-5225-3637-6.ch017},
isbn = {152253637X},
journal = {Handbook of Research on Perception-Driven Approaches to Urban Assessment and Design},
month = {1},
pages = {397-433},
title = {Graphical research tools for acoustic design training: Capturing perception in architectural settings},
year = {2018},
}
@inproceedings{Mishra2018,
abstract = {© Saumitra Mishra, Bob L. Sturm, Simon Dixon. Methods for interpreting machine learning models can help one understand their global and/or local behaviours, and thereby improve them. In this work, we apply a global analysis method to a machine listening model, which essentially inverts the features generated in a model back into an interpretable form like a sonogram. We demonstrate this method for a state-of-the-art singing voice detection model. We train up-convolutional neural networks to invert the feature generated at each layer of the model. The results suggest that the deepest fully connected layer of the model does not preserve temporal and harmonic structures, but that the inverted features from the deepest convolutional layer do. Moreover, a qualitative analysis of a large number of inputs suggests that the deepest layer in the model learns a decision function as the information it preserves depends on the class label associated with an input.},
author = {S Mishra and B L Sturm and S Dixon},
isbn = {9782954035123},
journal = {Proceedings of the 19th International Society for Music Information Retrieval Conference, ISMIR 2018},
month = {1},
pages = {755-762},
title = {Understanding a deep machine listening model through feature inversion},
year = {2018},
}
@inproceedings{Mishra2018b,
abstract = {© EURASIP 2018. Researchers have proposed methods to explain neural network predictions by building explanations either in terms of input components (e.g., pixels in an image) or in terms of input regions (e.g., the area containing the face of a Labrador). Such methods aim to determine the trustworthiness of a model, as well as to guide its improvement. In this paper, we argue that explanations in terms of input regions are useful for analysing machine listening systems. We introduce a novel method based on feature inversion to identify a region in an input time-frequency representation that is most influential to a prediction. We demonstrate it for a state-of-the-art singing voice detection model. We evaluate the quality of the generated explanations on two public benchmark datasets. The results demonstrate that the presented method often identifies a region of an input instance that has a decisive effect on the classification.},
author = {S Mishra and B L Sturm and S Dixon},
doi = {10.23919/EUSIPCO.2018.8553178},
isbn = {9789082797015},
issn = {2219-5491},
journal = {European Signal Processing Conference},
month = {11},
pages = {2260-2264},
title = {“What are you listening to?” Explaining predictions of deep machine listening systems},
volume = {2018-September},
year = {2018},
}
@inproceedings{Moffat2018,
abstract = {© 2018 KASHYAP. Dynamic range compression (DRC) is a very commonly used audio effect. One use of DRC is to emphasise transients in an audio signal. This paper presents an approach for automatically and adaptively setting dynamic range compression timing parameters, allowing them to adapt to the incoming audio signal with the aim of emphasising transients within percussive audio tracks. An implementation approach is presented.},
author = {D Moffat and M B Sandler},
journal = {145th Audio Engineering Society International Convention, AES 2018},
month = {1},
title = {Adaptive ballistics control of dynamic range compression for percussive tracks},
year = {2018},
}
@article{MOFFAT2018b,
abstract = {Sound synthesis is the process of generating artificial sounds through some form of simulation or modelling. This article aims to identify which sound synthesis methods achieve the goal of producing a believable audio sample that may replace a recorded sound sample. A perceptual evaluation experiment of five different sound synthesis techniques was undertaken. Additive synthesis, statistical modelling synthesis with two different feature sets, physically inspired synthesis, concatenative synthesis, and sinusoidal modelling synthesis were all compared. Evaluation using eight different sound class stimuli and 66 different samples was undertaken. The additive synthesizer is the only synthesis method not considered significantly different from the reference sample across all sounds classes. The results demonstrate that sound synthesis can be considered as realistic as a recorded sample and makes recommendations for use of synthesis methods, given different sound class contexts.},
author = {D J MOFFAT and J D REISS},
doi = {10.1145/3165287},
issue = {2},
journal = {ACM Transactions on Applied Perception (TAP)},
month = {4},
publisher = {ACM},
title = {Perceptual Evaluation of Synthesized Sound Effects},
volume = {15},
url = {https://dl.acm.org/citation.cfm?id=3165287},
year = {2018},
}
@article{Morfi2018,
author = {V Morfi and D Stowell},
doi = {10.3390/app8081397},
issn = {1454-5101},
issue = {8},
journal = {Applied Sciences},
month = {8},
publisher = {MDPI},
title = {Deep Learning for Audio Event Detection and Tagging on Low-Resource Datasets},
volume = {8},
url = {http://arxiv.org/abs/1807.03697v2},
year = {2018},
}
@article{MORREALE2018,
author = {F MORREALE and J ARMITAGE and A MCPHERSON},
doi = {10.3389/fpsyg.2018.02436},
issn = {1664-1078},
journal = {Frontiers in Psychology},
month = {12},
publisher = {Frontiers Media},
title = {Effect of Instrument Structure Alterations on Violin Performance},
year = {2018},
}
@inproceedings{Mycroft2018,
author = {J Mycroft and T Stockman and J D Reiss},
doi = {10.1145/3243274.3243290},
isbn = {9781450366090},
journal = {ACM International Conference Proceeding Series},
month = {9},
title = {A prototype mixer to improve cross-modal attention during audio mixing},
year = {2018},
}
@inproceedings{Nakamura2018,
abstract = {Most work on automatic transcription produces "piano roll" data with no musical interpretation of the rhythm or pitches. We present a polyphonic transcription method that converts a music audio signal into a human-readable musical score, by integrating multi-pitch detection and rhythm quantization methods. This integration is made difficult by the fact that the multi-pitch detection produces erroneous notes such as extra notes and introduces timing errors that are added to temporal deviations due to musical expression. Thus, we propose a rhythm quantization method that can remove extra notes by extending the metrical hidden Markov model and optimize the model parameters. We also improve the note-tracking process of multi-pitch detection by refining the treatment of repeated notes and adjustment of onset times. Finally, we propose evaluation measures for transcribed scores. Systematic evaluations on commonly used classical piano data show that these treatments improve the performance of transcription, which can be used as benchmarks for further studies.},
author = {E Nakamura and E Benetos and K Yoshii and S Dixon},
month = {4},
pages = {101-105},
publisher = {IEEE},
title = {Towards Complete Polyphonic Music Transcription: Integrating Multi-Pitch Detection and Rhythm Quantization},
url = {https://2018.ieeeicassp.org/},
year = {2018},
}
@inproceedings{Nolasco2018,
abstract = {In this work, we aim to explore the potential of machine learning methods to the problem of beehive sound recognition. A major contribution of this work is the creation and release of annotations for a selection of beehive recordings. By experimenting with both support vector machines and convolutional neural networks, we explore important aspects to be considered in the development of beehive sound recognition systems using machine learning approaches.},
author = {I Nolasco and E Benetos},
journal = {Proceedings of the Detection and Classification of Acoustic Scenes and Events 2018 Workshop (DCASE2018)},
month = {11},
title = {To bee or not to bee: Investigating machine learning approaches for beehive sound recognition},
url = {http://dcase.community/documents/workshop2018/proceedings/DCASE2018Workshop_Nolasco_131.pdf},
year = {2018},
}
@inproceedings{hanlon2018,
abstract = {© 2018 IEEE. Onset detection is a fundamental task in musical signal processing, providing information for higher level applications. Different classes of onsets can be found in musical signals, determined as being hard, or soft, by the initial energy transfer. Most onset detectors are general purpose and attempt to detect both classes of onsets, although some specifically attempt to detect soft onsets. Temporal reassignment operators related to group delay have previously been employed in onset detectors for the purposes of soft onset detection and pruning of time-frequency elements deemed to consist of vibrato. We consider the use of temporal reassignment for the detection of hard onsets and also employ the second mixed derivative of phase as a means to prune the spectral energy. Experimental validation of the proposed approach is given, showing improvements relative to state-of-the-art general purpose onset detectors for the specific tasks.},
author = {K O'Hanlon and M B Sandler},
doi = {10.1109/ICASSP.2018.8461381},
isbn = {9781538646588},
issn = {1520-6149},
journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
month = {9},
pages = {611-615},
title = {Improved Detection of Semi-Percussive Onsets in Audio Using Temporal Reassignment},
volume = {2018-April},
year = {2018},
}
@article{PANTELI2018,
abstract = {The comparison of world music cultures has been a recurring topic in the field of musicology since the end of the nineteenth century. Recent advances in technology in the field of Music Information Retrieval allow for a large-scale analysis of music corpora. We review manual and computational approaches in the literature that fall within the scope of music corpus research and world music analysis. With a large-scale computational music corpus analysis in mind, we compare the tools and research questions addressed by each study and discuss strengths and weaknesses. Taking into account critical remarks from experts in the field and challenges involved in a large-scale computational analysis, we discuss how this line of research can be improved in future work.},
author = {M PANTELI and E BENETOS and S DIXON},
doi = {10.1080/09298215.2017.1418896},
issn = {0929-8215},
issue = {2},
journal = {Journal of New Music Research},
month = {1},
pages = {176-189},
publisher = {Taylor & Francis (Routledge)},
title = {A review of manual and computational approaches for the study of world music corpora},
volume = {47},
year = {2018},
}
@inproceedings{Pardue2018,
abstract = {© Proceedings of the 15th Sound and Music Computing Conference: Sonic Crossings, SMC 2018. All rights reserved. This position paper introduces the concept of complexity management in instrument design as a means to optimize the learning rewards cycle in an effort to maintain player motivation. Successful fluency and expertise on an instrument requires sustained practice. In the quest to enable exceptional levels of expression, instruments designed for virtuosic performance often have a high level of complexity, which can be overwhelming for a beginner, decreasing practice motivation. Here we explain complexity management, the idea of intentionally limiting instrument complexity on a temporary basis so that instrument difficulty is optimally matched to user skill and users always remain capable of focused learning and enjoy sufficient musical success to motivate continued practice. We discuss the relevance of Csikszentmihalyi's ideas about flow, along with concepts from traditional music learning, such as chunking and internalization, along with the importance of practice and enjoyment. We then propose our own concept of learning efficiency and the importance of controlling challenge. Finally, we introduce our own experiments into complexity management using the violin, an existing example of an instrument with high input complexity. We discuss the effects of simplifying intonation in order to make early musical success easier along with plans for further investigations.},
author = {L S Pardue and A McPherson and D Overholt},
isbn = {9789963697304},
journal = {Proceedings of the 15th Sound and Music Computing Conference: Sonic Crossings, SMC 2018},
month = {1},
pages = {150-157},
title = {Improving the instrumental learning experience through complexity management},
year = {2018},
}
@inproceedings{Pauwels2018,
abstract = {A common problem in music education is finding varied and engaging material that is suitable for practising a specific musical concept or technique. At the same time, a number of large music collections are available under a Creative Commons (CC) licence (e.g. Jamendo, ccMixter), but their potential is largely untapped because of the relative obscurity of their content. In this paper, we present *Jam with Jamendo*, a web application that allows novice and expert learners of musical instruments to query songs by chord content from a large music collection, and practise the chords present in the retrieved songs by playing along. Its goal is twofold: the learners get a larger variety of practice material, while the artists receive increased exposure. We experimented with two visualisation modes. The first is a linear visualisation based on a moving time axis, the second is a circular visualisation inspired by the chromatic circle. We conducted a small-scale thinking-aloud user study with seven participants based on a hands-on practice with the web app. Through this pilot study, we obtained a qualitative understanding of the potentials and challenges of each visualisation, which will be used to inform the next design iteration of the web app.},
author = {J Pauwels and A Xambó and G Roma and M Barthet and G Fazekas},
journal = {Proceedings of the 4th Web Audio Conference (WAC)},
month = {9},
title = {Exploring Real-time Visualisations to Support Chord Learning with a Large Music Collection},
year = {2018},
}
@inproceedings{Pauwels2018b,
abstract = {Lately, a number of audio players based on web technology have made it possible for researchers to present their audio-related work in an attractive manner. Tools such as "wavesurfer.js", "waveform-playlist" and "trackswitch.js" provide highly-configurable players, allowing a more interactive exploration of scientific results that goes beyond simple linear playback.
However, the audio output to be presented is in many cases not generated by the same web technologies. The process of preparing audio data for display therefore requires manual intervention, in order to bridge the resulting gap between programming languages. While this is acceptable for one-time events, such as the preparation of final results, it prevents the usage of such players during the iterative development cycle. Having access to rich audio players already during development would allow researchers to get more instantaneous feedback. The current workflow consists of repeatedly importing audio into a digital audio workstation in order to achieve similar capabilities, a repetitive and time-consuming process.
In order to address these needs, we present "pywebaudioplayer", a Python package that automates the generation of code snippets for each of the three aforementioned web audio players. It is aimed at use-cases where audio development in Python is combined with web visualisation. Notable examples are "Jupyter Notebook" and WSGI-compatible web frameworks such as "Flask" or "Django".},
author = {J Pauwels and M Sandler},
journal = {Proceedings of the 4th Web Audio Conference (WAC)},
month = {9},
title = {pywebaudioplayer: Bridging the gap between audio processing code and attractive visualisations based on web technology},
year = {2018},
}
@inproceedings{Pauwels2018c,
abstract = {Music learners looking for practice material to play along with are not served well by the current search interfaces for large music collections. While it is easy to find specific songs using meta-data or audio fingerprinting, discovering new music based on musical content is hard. In this paper, we'll look at the challenges that arise when creating a search interface that allows querying for songs based on chord content. Specifically, we'll discuss different ways of fulfilling queries and how imperfect chord transcriptions resulting from the automatic estimation process are handled.},
author = {J Pauwels and G Fazekas and M Sandler},
journal = {Proceedings of the 2018 Joint Workshop on Machine Learning for Music},
month = {7},
title = {Recommending songs to music learners based on chord content},
year = {2018},
}
@book_section{Pearce2018,
abstract = {© 2018, Springer-Verlag Berlin Heidelberg. Efforts to develop a formal characterization of musical structure are often framed in syntactic terms, sometimes but not always with direct inspiration from research on language. In Chap. 25, we present syntactic approaches to characterizing musical structure and survey a range of theoretical issues involved in developing formal syntactic theories of sequential structure in music. Such theories are often computational in nature, lending themselves to implementation and our first goal here is to review empirical research on computational modeling of musical structure from a syntactic point of view. We ask about the motivations for implementing a model and assess the range of approaches that have been taken to date. It is important to note that while a computational model may be capable of deriving an optimal structural description of a piece of music, human cognitive processing may not achieve this optimal performance, or may even process syntax in a different way. Therefore we emphasize the difference between developing an optimal model of syntactic processing and developing a model that simulates human syntactic processing. Furthermore, we argue that, while optimal models (e. g., optimal compression or prediction) can be useful as a benchmark or yardstick for assessing human performance, if we wish to understand human cognition then simulating human performance (including aspects that are nonoptimal or even erroneous) should be the priority. Following this principle, we survey research on processing of musical syntax from the perspective of computational modeling, experimental psychology and cognitive neuroscience. There exists a large number of computational models of musical syntax, but we limit ourselves to those that are explicitly cognitively motivated, assessing them in the context of theoretical, psychological and neuroscientific research.},
author = {M Pearce and M Rohrmeier},
doi = {10.1007/978-3-662-55004-5_26},
issn = {2522-8692},
journal = {Springer Handbooks},
month = {1},
pages = {487-505},
title = {Musical Syntax II: Empirical Perspectives},
year = {2018},
}
@article{PEARCE2018b,
author = {M T PEARCE},
doi = {10.1111/nyas.13654},
issn = {0077-8923},
journal = {Annals of the New York Academy of Sciences},
month = {5},
publisher = {Wiley},
title = {Statistical Learning and Probabilistic Prediction in Music Cognition: Mechanisms of Stylistic Enculturation},
year = {2018},
}
@inproceedings{Peng2018,
abstract = {© 2018 KASHYAP. Studies have shown that listeners can distinguish between hot and cold water being poured based solely on sonic properties, yet the cause of this is unknown. This acoustic perception of temperature is an interesting aspect of multisensory perception and integration. In this paper, a series of experiments were performed to investigate the characteristics of auditory information when water is poured at different temperatures into various containers. Based on the results, it attempts to find physical and psychoacoustic explanations for the phenomenon.},
author = {H Peng and J D Reiss},
journal = {145th Audio Engineering Society International Convention, AES 2018},
month = {1},
title = {Why can you hear a difference between pouring hot and cold water? An investigation of temperature dependence in psychoacoustics},
year = {2018},
}
@inproceedings{Pras2018,
abstract = {© 2018 Audio Engineering Society. All Rights Reserved. While sound mixers of popular music may share common principles across cultures, different engineers produce different mixes, and different listeners judge a mix differently. We designed a mixed-methods approach to examine this highly multidimensional problem in both style and perceived quality. Five student sound engineers from the Paris Conservatoire mixed the multitrack source of two pop songs and fully documented their mixing process. The resulting mixes were then used as stimuli for a blind, multi-stimulus listening test in a high-quality listening room, that 13 students and one faculty member commented on and rated in terms of preference. Our outcomes highlight cultural and generational mixing specificities and offer a better understanding of the artistic side of the practice.},
author = {A Pras and B De Man and J D Reiss},
journal = {144th Audio Engineering Society Convention 2018},
month = {1},
title = {A case study of cultural influences on mixing practices},
year = {2018},
}
@article{PURVER2018,
author = {M R J PURVER and J HOUGH and C HOWES},
doi = {10.1111/tops.12324},
issn = {1756-8765},
journal = {Topics in Cognitive Science},
month = {3},
publisher = {Wiley},
title = {Computational Models of Miscommunication Phenomena},
url = {http://www.eecs.qmul.ac.uk/~mpurver/papers/purver-et-al18topics.pdf},
year = {2018},
}
@article{quiroga2018,
abstract = {Theories of predictive processing propose that prediction error responses are modulated by the certainty of the predictive model, or precision. While there is some evidence for this phenomenon in the visual and, to a lesser extent, the auditory modality, little is known about whether it operates in the complex auditory contexts of daily life. Here, we examined how prediction error responses behave in a more complex and ecologically valid auditory context than those typically studied. We created musical tone sequences with different degrees of pitch uncertainty to manipulate the precision of participants’ auditory expectations. Magnetoencephalography was used to measure the magnetic counterpart of the mismatch negativity (MMNm) as a neural marker of prediction error in a multi-feature paradigm. Pitch, slide, intensity and timbre deviants were included. We compared high-entropy stimuli, consisting of a set of non-repetitive melodies, with low-entropy stimuli consisting of a simple, repetitive pitch pattern. Pitch entropy was quantitatively assessed with an information-theoretic model of auditory expectation. We found a reduction in pitch and slide MMNm amplitudes in the high-entropy as compared to the low-entropy context. No significant differences were found for intensity and timbre MMNm amplitudes. Furthermore, in a separate behavioral experiment investigating the detection of pitch deviants, similar decreases were found for accuracy measures in response to more fine-grained increases in pitch entropy. Our results are consistent with a precision modulation of auditory prediction error in a musical context, and suggest that this effect is specific to features that depend on the manipulated dimension (pitch information, in this case). Highlights: the mismatch negativity (MMNm) is reduced in musical contexts with high pitch uncertainty; the MMNm reduction is restricted to pitch-related features; accuracy during deviance detection is reduced in contexts with higher uncertainty; the results suggest a feature-selective precision modulation of prediction error. Materials, data and scripts can be found in the Open Science Framework repository: http://bit.ly/music_entropy_MMN (DOI: 10.17605/OSF.IO/MY6TE).},
author = {D R Quiroga-Martinez and N C Hansen and A Højlund and M Pearce and E Brattico and P Vuust},
doi = {10.1101/422949},
journal = {bioRxiv},
month = {9},
publisher = {bioRxiv},
title = {Reduced prediction error responses in high- as compared to low-uncertainty musical contexts},
year = {2018},
}
@book_section{Rohrmeier2018,
abstract = {© 2018, Springer-Verlag Berlin Heidelberg. The understanding of musical syntax is a topic of fundamental importance for systematic musicology and lies at the core intersection of music theory and analysis, music psychology, and computational modeling. This chapter discusses the notion of musical syntax and its potential foundations based on notions such as sequence grammaticality, expressive unboundedness, generative capacity, sequence compression and stability. Subsequently, it discusses problems concerning the choice of musical building blocks to be modeled as well as the underlying principles of sequential structure building. The remainder of the chapter reviews the main theoretical proposals that can be characterized under different mechanisms of structure building, in particular approaches using finite-context or finite-state models as well as tree-based models of context-free complexity (including the Generative Theory of Tonal Music) and beyond. The chapter concludes with a discussion of the main issues and questions driving current research and a preparation for the subsequent empirical chapter Musical Syntax II.},
author = {M Rohrmeier and M Pearce},
doi = {10.1007/978-3-662-55004-5_25},
issn = {2522-8692},
journal = {Springer Handbooks},
month = {1},
pages = {473-486},
title = {Musical Syntax I: Theoretical Perspectives},
year = {2018},
}
@article{Sears2018,
author = {D R W Sears and M T Pearce and J Spitzer and W E Caplin and S McAdams},
doi = {10.1177/1747021818814472},
journal = {Quarterly Journal of Experimental Psychology},
month = {11},
pages = {1747021818814472–1747021818814472},
title = {Expectations for tonal cadences: Sensory and cognitive priming effects.},
url = {https://www.ncbi.nlm.nih.gov/pubmed/30404574},
year = {2018},
}
@article{SELFRIDGE2018,
abstract = {Aeroacoustics is a branch of engineering within fluid dynamics. It encompasses sounds generated by disturbances in air, either by an airflow being disturbed by an object or an object moving through air. A number of fundamental sound sources exist depending on the geometry of the interacting objects and the characteristics of the flow. An example of a fundamental aeroacoustic sound source is the Aeolian tone, generated by vortex shedding as air flows around an object. A compact source model of this sound is derived from fluid dynamics principles, operating in real-time and presenting highly relevant parameters to the user. A swinging sword, Aeolian harp and propeller are behaviour models presented to illustrate how a taxonomy of real-time aeroacoustic sound synthesis can be achieved through physical modelling. Evaluation indicates that the resulting sounds are perceptually as believable as sounds produced by other synthesis methods, while objective evaluations reveal similarities and differences between our models, pre-recorded samples and those generated by computationally complex offline methods.},
author = {R SELFRIDGE and D MOFFAT and E AVITAL and J REISS},
doi = {10.17743/jaes.2018.0033},
issn = {1549-4950},
issue = {7/8},
journal = {Journal of the Audio Engineering Society},
month = {8},
pages = {594-607},
publisher = {Audio Engineering Society},
title = {Creating Real-Time Aeroacoustic Sound Effects Using Physically Informed Models},
volume = {66},
url = {http://www.aes.org/e-lib/browse.cfm?elib=19708},
year = {2018},
}
@inproceedings{Selfridge2018b,
abstract = {© 2018 Audio Engineering Society. All Rights Reserved. The edge tone is the sound generated when a planar jet of air from a nozzle comes into contact with a wedge and a number of physical conditions are met. Fluid dynamics equations were used to synthesise authentic edge tones without the need for complex computation. A real-time physically derived synthesis model was designed using the jet airspeed and nozzle exit-to-wedge geometry. We compare different theoretical equations used to predict the tone frequency. A decision tree derived from machine learning based on previously published experimental results was used to predict the correct mode of operation. Results showed an accurate implementation for mode selection, and highlighted areas where operation follows or deviates from previously published data.},
author = {R Selfridge and J D Reiss and E J Avital},
journal = {144th Audio Engineering Society Convention 2018},
month = {1},
title = {Physically derived synthesis model of an edge tone},
year = {2018},
}
@inproceedings{Sheng2018,
abstract = {This paper proposes a method of controlling the dynamic range compressor using sound examples. Our earlier work showed the effectiveness of random forest regression to map acoustic features to effect control parameters. We extend this work to address the challenging task of extracting relevant features when audio events overlap. We assess different audio decomposition approaches such as onset event detection, NMF, and transient/stationary audio separation using ISTA and compare feature extraction strategies for each case. Numerical and perceptual similarity tests show the utility of audio decomposition as well as specific features in the prediction of dynamic range compressor parameters.},
author = {D Sheng and G Fazekas},
journal = {Proc. of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), April 15-20, Calgary, Canada.},
keywords = {intelligent music production, ICASSP, intelligent audio effects},
title = {Feature Design Using Audio Decomposition for Intelligent Control of the Dynamic Range Compressor},
url = {https://2018.ieeeicassp.org/Papers/ViewPapers.asp?PaperNum=3048},
year = {2018},
}
@inproceedings{Sheng2018b,
abstract = {Casual users of audio effects may lack practical experience or knowledge of their low-level signal processing parameters. An intelligent control tool that allows using sound examples to control effects would strongly benefit these users. In a previous work we proposed a control method for the dynamic range compressor (DRC) using a random forest regression model. It maps audio features extracted from a reference sound to DRC parameter values, such that the processed signal resembles the reference. The key to good performance in this system is the relevance and effectiveness of audio features. This paper focusses on a thorough exposition and assessment of the features, as well as the comparison of different strategies to find the optimal feature set for DRC parameter estimation, using automatic feature selection methods. This enables us to draw conclusions about which features are relevant to core DRC parameters. Our results show that conventional time and frequency domain features well known from the literature are sufficient to estimate the DRC’s threshold and ratio parameters, while more specialized features are needed for attack and release time, which induce more subtle changes to the signal.},
author = {D Sheng and G Fazekas},
journal = {Proc. of the 144th Convention of the Audio Engineering Society, 23-26 May, Milan, Italy},
keywords = {feature selection, intelligent music production, AES, intelligent audio effects},
title = {Feature Selection for Dynamic Range Compressor Parameter Estimation},
url = {http://www.aes.org/events/144/papers/?ID=5993},
year = {2018},
}
@inproceedings{SHUKLA2018,
abstract = {If well-matched to a given listener, head-related transfer functions (HRTFs) that have not been individually measured can still present relatively effective auditory scenes compared to renderings from individualised HRTF sets. We present and assess a system for HRTF selection that relies on holistic judgements of users to identify their optimal match through a series of pairwise adversarial comparisons. The mechanism resulted in clear preference for a single HRTF set in a majority of cases. Where this did not occur, randomised selection between equally judged HRTFs did not significantly impact user performance in a subsequent listening task. This approach is shown to be equally effective for both novice and expert listeners in selecting their preferred HRTF set.},
author = {R C Shukla and R L Stewart and A Roginska and M B Sandler},
city = {New York, NY, USA},
journal = {Proceedings of the AES International Conference},
month = {8},
pages = {1-10},
publisher = {Audio Engineering Society},
title = {User Selection of Optimal HRTF Sets via Holistic Comparative Evaluation},
url = {http://www.aes.org/e-lib/inst/browse.cfm?elib=19677},
year = {2018},
}
@inproceedings{SKACH2018,
abstract = {This paper presents initial steps towards the design of an embedded system for body-centric sonic performance. The proposed prototyping system allows performers to manipulate sounds through gestural interactions captured by textile wearable sensors. The e-textile sensor data control, in real-time, audio synthesis algorithms working with content from Audio Commons, a novel web-based ecosystem for re-purposing crowd-sourced audio. The system enables creative embodied music interactions by combining seamless physical e-textiles with web-based digital audio technologies.},
author = {S Skach and A Xambó and L Turchet and A Stolfi and R L Stewart and M H E Barthet},
doi = {10.1145/3173225.3173272},
month = {3},
title = {Embodied Interactions with E-Textiles and the Internet of Sounds for Performing Arts},
year = {2018},
}
@inproceedings{Skach2018b,
abstract = {© 2018 Copyright held by the owner/author(s). Body posture is a good indicator of, amongst other things, people's state of arousal, focus of attention and level of interest in a conversation. Posture is conventionally measured by observation and hand coding of videos or, more recently, through automated computer vision and motion capture techniques. Here we introduce a novel alternative approach exploiting a new modality: posture classification using bespoke'smart' trousers with integrated textile pressure sensors. Changes in posture translate to changes in pressure patterns across the surface of our clothing. We describe the construction of the textile pressure sensor that can detect these changes. Using simple machine learning techniques on data gathered from 6 participants we demonstrate its ability to discriminate between 19 different basic posture types with high accuracy. This technology has the potential to support anonymous, unintrusive sensing of interest, attention and engagement in a wide variety of settings.},
author = {S Skach and R Stewart and P G T Healey},
doi = {10.1145/3242969.3242977},
isbn = {9781450356923},
journal = {ICMI 2018 - Proceedings of the 2018 International Conference on Multimodal Interaction},
month = {10},
pages = {116-124},
title = {Smart ARSE: Posture classification with textile sensors in trousers},
year = {2018},
}
@article{STOCKMAN2018,
author = {A G STOCKMAN and D AL-THANI},
doi = {10.1093/iwc/iwy017},
journal = {Interacting With Computers},
month = {9},
title = {Evaluating an Interface for Cross-modal Information Seeking},
year = {2018},
}
@inproceedings{STOCKMAN2018b,
author = {A G STOCKMAN and O METATLA},
doi = {10.1145/3173574.3174120},
month = {4},
title = {“I Hear You”: Understanding Awareness Information Exchange in an Audio-only Workspace},
year = {2018},
}
@article{STOCKMAN2018c,
author = {T STOCKMAN and S Wilkie},
doi = {10.1016/j.apacoust.2017.12.032},
issn = {1872-910X},
journal = {Applied Acoustics},
month = {1},
publisher = {Elsevier},
title = {Perception of objects that move in depth, using ecologically valid audio cues},
year = {2018},
}
@article{Stolfi2018,
abstract = {© 2018 Audio Engineering Society. All Rights Reserved. This paper analyzes communication patterns occurring in the online chat of the "Open Band" system for participatory live music performance. In addition to acting as a multi-user messaging tool, the chat system also serves as a control interface for the sonification of textual messages from the audience. Open Band performances have been presented at various festivals and conferences since 2016. Its web-based platform enables collective "sound dialogues" that are open to everyone regardless of musical skills. Drawing on interactive participatory art and networked music performance, the system aims to provide engaging social experiences in colocated music-making situations. We collected data from four public performances including over 3,000 anonymous messages sent by audiences. We present the design of the system and then analyze the semantic content of messages using thematic and statistical analyses. Findings show how different sonification mechanisms alter the nature of the communication between participants who articulate between linguistic and musical self-expression.},
author = {A Stolfi and J Sokolovskis and F Gorodscy and F Iazzetta and M Barthet},
doi = {10.17743/jaes.2018.0048},
issn = {1549-4950},
issue = {11},
journal = {AES: Journal of the Audio Engineering Society},
month = {11},
pages = {910-921},
title = {Audio semantics: Online chat communication in open band participatory music performances},
volume = {66},
year = {2018},
}
@inproceedings{Stoller2018,
abstract = {© 2018 IEEE. The state of the art in music source separation employs neural networks trained in a supervised fashion on multi-track databases to estimate the sources from a given mixture. With only few datasets available, often extensive data augmentation is used to combat overfitting. Mixing random tracks, however, can even reduce separation performance as instruments in real music are strongly correlated. The key concept in our approach is that source estimates of an optimal separator should be indistinguishable from real source signals. Based on this idea, we drive the separator towards outputs deemed as realistic by discriminator networks that are trained to tell apart real from separator samples. This way, we can also use unpaired source and mixture recordings without the drawbacks of creating unrealistic music mixtures. Our framework is widely applicable as it does not assume a specific network architecture or number of sources. To our knowledge, this is the first adoption of adversarial training for music source separation. In a prototype experiment for singing voice separation, separation performance increases with our approach compared to purely supervised training.},
author = {D Stoller and S Ewert and S Dixon},
doi = {10.1109/ICASSP.2018.8461722},
isbn = {9781538646588},
issn = {1520-6149},
journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
month = {9},
pages = {2391-2395},
title = {Adversarial Semi-Supervised Audio Source Separation Applied to Singing Voice Extraction},
volume = {2018-April},
year = {2018},
}
@inproceedings{Stoller2018b,
abstract = {© 2018 IEEE. Existing music recordings are often rearranged, for example to fit their duration and structure to video content. Often an expert is needed to find suitable cut points allowing for imperceptible transitions between different sections. In previous work, the search for these cuts is restricted to the beginnings of beats or measures and only timbre and loudness are taken into account, while melodic expectations and instrument continuity are neglected. We instead aim to learn these features by training neural networks on a dataset of over 300 popular Western songs to classify which note onsets are suitable entry or exit points for a cut. We investigate existing and novel architectures and different feature representations, and find that best performance is achieved using neural networks with two-dimensional convolutions applied to spectrogram input covering several seconds of audio with a high temporal resolution of 23 or 46 ms. Finally, we analyse our best model using saliency maps and find it attends to rhythmical structures and the presence of sounds at the onset position, suggesting instrument activity to be important for predicting cut quality.},
author = {D Stoller and V Akkermans and S Dixon},
doi = {10.1109/MLSP.2018.8516706},
isbn = {9781538654774},
issn = {2161-0363},
journal = {IEEE International Workshop on Machine Learning for Signal Processing, MLSP},
month = {10},
title = {Detection of cut-points for automatic music rearrangement},
volume = {2018-September},
year = {2018},
}
@inproceedings{STOLLER2018c,
author = {D Stoller and S Ewert and S Dixon},
month = {6},
title = {Jointly detecting and separating singing voice: a multi-task approach},
year = {2018},
}
@article{Stowell2018,
abstract = {Assessing the presence and abundance of birds is important for monitoring specific species as well as overall ecosystem health. Many birds are most readily detected by their sounds, and thus, passive acoustic monitoring is highly appropriate. Yet acoustic monitoring is often held back by practical limitations such as the need for manual configuration, reliance on example sound libraries, low accuracy, low robustness, and limited ability to generalise to novel acoustic conditions. Here, we report outcomes from a collaborative data challenge. We present new acoustic monitoring datasets, summarise the machine learning techniques proposed by challenge teams, conduct detailed performance evaluation, and discuss how such approaches to detection can be integrated into remote monitoring projects. Multiple methods were able to attain performance of around 88% area under the receiver operating characteristic (ROC) curve (AUC), much higher performance than previous general-purpose methods. With modern machine learning, including deep learning, general-purpose acoustic bird detection can achieve very high retrieval rates in remote monitoring data, with no manual recalibration, and no pretraining of the detector for the target species or the acoustic conditions in the target environment.},
author = {D Stowell and M D Wood and H Pamuła and Y Stylianou and H Glotin},
doi = {10.1111/2041-210X.13103},
journal = {Methods in Ecology and Evolution},
month = {11},
title = {Automatic acoustic detection of birds through deep learning: The first Bird Audio Detection challenge},
year = {2018},
}
@inproceedings{THALMANN2018,
abstract = {We describe the concepts behind a web-based minimal-UI DJ system that adapts to the user’s preference via simple interactive decisions and feedback on taste. Starting from a preset decision tree modeled on common DJ practice, the system can gradually learn a more customised and user-specific tree. At the core of the system are structural representations of the musical content based on semantic audio technologies and inferred from features extracted from the audio directly in the browser. These representations are gradually combined into a representation of the mix which could then be saved and shared with other users. We show how different types of transitions can be modeled using simple musical constraints. Potential applications of the system include crowd-sourced data collection, both on temporally aligned playlisting and musical preference.},
author = {F Thalmann and L Thompson and M Sandler},
month = {9},
title = {A User-Adaptive Automated DJ Web App with Object-Based Audio and Crowd-Sourced Decision Trees},
year = {2018},
}
@inproceedings{Thalmann2018b,
abstract = {© 2018 Copyright held by the owner/author(s). We refine and unify our previous data model for describing and linking live music artefacts. In our model, physical and digital artefacts and recordings are treated as forms of cultural heritage which can all be aligned and distributed along the same event timeline. We show how our ontology maps to existing conceptual models and we evaluate it with a number of example queries as well as in practice, embedded in an online platform dedicated to the exploration of aggregated information documenting the live music events of a specific band.},
author = {F Thalmann and T Wilmering and M B Sandler},
doi = {10.1145/3243907.3243910},
isbn = {9781450364959},
journal = {ACM International Conference Proceeding Series},
month = {10},
pages = {1-5},
title = {Cultural heritage documentation and exploration of live music events with linked data},
year = {2018},
}
@article{Turchet2018,
abstract = {© 2018 Audio Engineering Society. All rights reserved. Smart Instruments are a novel family of musical instruments that embed sensors, actuators, wireless connectivity, and semantic audio technologies. This paper reports the findings of a participatory design approach to develop a Smart Cajón, a box-shaped percussion instrument with Internet of Musical Things components. Five initial co-design sessions were conducted with different professional cajón player participants. The players were invited to devise tangible mock-ups by placing provided sensors on an acoustic cajón and to express desirable use cases and interactions. We then designed and implemented a prototype satisfying performers' common requirements. The prototype was assessed using the concurrent think-aloud protocol and semi-structured interviews. Overall, the smart qualities of the prototype and their potential received positive feedback, and areas of improvements related to expressive control and personalization were highlighted.},
author = {L Turchet and A McPherson and M Barthet},
doi = {10.17743/jaes.2018.0007},
issn = {1549-4950},
issue = {4},
journal = {AES: Journal of the Audio Engineering Society},
month = {4},
pages = {220-230},
title = {Co-design of a smart Cajón},
volume = {66},
year = {2018},
}
@article{TURCHET2018b,
author = {L TURCHET and M BARTHET},
doi = {10.1109/THMS.2018.2885408},
issn = {2168-2291},
journal = {IEEE Transactions on Human-Machine Systems},
month = {12},
publisher = {Institute of Electrical and Electronics Engineers},
title = {Co-design of Musical Haptic Wearables for Electronic Music Performer's Communication},
year = {2018},
}
@article{Turchet2018c,
abstract = {© 2013 IEEE. The Internet of Musical Things (IoMusT) is an emerging research field positioned at the intersection of Internet of Things, new interfaces for musical expression, ubiquitous music, human-computer interaction, artificial intelligence, and participatory art. From a computer science perspective, IoMusT refers to the networks of computing devices embedded in physical objects (musical things) dedicated to the production and/or reception of musical content. Musical things, such as smart musical instruments or wearables, are connected by an infrastructure that enables multidirectional communication, both locally and remotely. We present a vision in which the IoMusT enables the connection of digital and physical domains by means of appropriate information and communication technologies, fostering novel musical applications and services. The ecosystems associated with the IoMusT include interoperable devices and services that connect musicians and audiences to support musician-musician, audience-musicians, and audience-audience interactions. In this paper, we first propose a vision for the IoMusT and its motivations. We then discuss five scenarios illustrating how the IoMusT could support: 1) augmented and immersive concert experiences; 2) audience participation; 3) remote rehearsals; 4) music e-learning; and 5) smart studio production. We identify key capabilities missing from today's systems and discuss the research needed to develop these capabilities across a set of interdisciplinary challenges. These encompass network communication (e.g., ultra-low latency and security), music information research (e.g., artificial intelligence for real-time audio content description and multimodal sensing), music interaction (e.g., distributed performance and music e-learning), as well as legal and responsible innovation aspects to ensure that future IoMusT services are socially desirable and undertaken in the public interest.},
author = {L Turchet and C Fischione and G Essl and D Keller and M Barthet},
doi = {10.1109/ACCESS.2018.2872625},
journal = {IEEE Access},
month = {9},
pages = {61994-62017},
title = {Internet of Musical Things: Vision and Challenges},
volume = {6},
year = {2018},
}
@inproceedings{Turchet2018d,
abstract = {© 2018 FRUCT Oy. This paper presents an Internet of Musical Things ecosystem involving musicians and audiences interacting with a smart mandolin, smartphones, and the Audio Commons online repository Freesound. The ecosystem has been devised to support performer-instrument and performer-audience interactions through the generation of musical accompaniments exploiting crowd-sourced sounds. We present two use cases investigating how audio content retrieved from Freesound can be leveraged by performers or audiences to produce accompanying soundtracks for music performance with a smart mandolin. In the performer-instrument interaction use case, the performer can select content to be retrieved prior to performing through a set of keywords and structure it in order to create the desired accompaniment. In the performer-audience interaction use case, a group of audience members participates in the music creation by selecting and arranging Freesound audio content to create an accompaniment collaboratively. We discuss the advantages and limitations of the system with regard to music making and audience participation, along with its implications and challenges.},
author = {L Turchet and M Barthet},
doi = {10.23919/FRUCT.2018.8588110},
isbn = {9789526865362},
issn = {2305-7254},
journal = {Conference of Open Innovation Association, FRUCT},
month = {12},
pages = {375-381},
title = {Jamming with a Smart Mandolin and Freesound-based Accompaniment},
volume = {2018-November},
year = {2018},